# Data Cleaning & Transformation
import pandas as pd
import datatable as dt
pd.set_option('display.max.columns', 500)
pd.set_option('display.max.columns', 500)
import numpy as np
# Data Understanding
from ydata_profiling import ProfileReport
# Data Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Machine Learning
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Hyperparameters Optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Ensemble Learning
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")
Function Purpose: Due to the large number of records especially after one-hot encoding, a way to reduce the memory is to ensure that the data type nicely fits the min-max of the variables
def reduce_memory_usage(df, verbose=True):
numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
start_mem = df.memory_usage().sum() / 1024 ** 2
for col in df.columns:
col_type = df[col].dtypes
if col_type in numerics:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == "int":
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: #-128 to 128
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: #-32768 to 32768
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: #-2147483648 to 2147483648
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: #-9223372036854775808 to 9223372036854775808
df[col] = df[col].astype(np.int64)
else:
if (
c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max #-65500 to 65500
):
df[col] = df[col].astype(np.float16)
elif (
c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max #-3.40e+38 to -3.40e+38
):
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64) #-1.80e+308 to 1.80e+308
end_mem = df.memory_usage().sum() / 1024 ** 2
if verbose:
print(
"Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
end_mem, 100 * (start_mem - end_mem) / start_mem
)
)
return df
# train_df = dt.fread("data/Loan_status_2007-2020Q3.csv").to_pandas()
train_df = pd.read_csv("data/Loan_status_2007-2020Q3.csv")
train_df
| id | loan_amnt | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | purpose | title | zip_code | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | application_type | mort_acc | pub_rec_bankruptcies | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1077501 | 5000.0 | 36 months | 10.65% | 162.87 | B | B2 | NaN | 10+ years | RENT | 24000.0 | Verified | Dec-2011 | Fully Paid | credit_card | Computer | 860xx | 27.65 | Jan-1985 | 3.0 | 0.0 | 13648.0 | 83.7% | 9.0 | f | Individual | NaN | 0.0 |
| 1 | 1077430 | 2500.0 | 60 months | 15.27% | 59.83 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | Source Verified | Dec-2011 | Charged Off | car | bike | 309xx | 1.00 | Apr-1999 | 3.0 | 0.0 | 1687.0 | 9.4% | 4.0 | f | Individual | NaN | 0.0 |
| 2 | 1077175 | 2400.0 | 36 months | 15.96% | 84.33 | C | C5 | NaN | 10+ years | RENT | 12252.0 | Not Verified | Dec-2011 | Fully Paid | small_business | real estate business | 606xx | 8.72 | Nov-2001 | 2.0 | 0.0 | 2956.0 | 98.5% | 10.0 | f | Individual | NaN | 0.0 |
| 3 | 1076863 | 10000.0 | 36 months | 13.49% | 339.31 | C | C1 | AIR RESOURCES BOARD | 10+ years | RENT | 49200.0 | Source Verified | Dec-2011 | Fully Paid | other | personel | 917xx | 20.00 | Feb-1996 | 10.0 | 0.0 | 5598.0 | 21% | 37.0 | f | Individual | NaN | 0.0 |
| 4 | 1075358 | 3000.0 | 60 months | 12.69% | 67.79 | B | B5 | University Medical Group | 1 year | RENT | 80000.0 | Source Verified | Dec-2011 | Fully Paid | other | Personal | 972xx | 17.94 | Jan-1996 | 15.0 | 0.0 | 27783.0 | 53.9% | 38.0 | f | Individual | NaN | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2925488 | 102556443 | 24000.0 | 60 months | 23.99% | 690.30 | E | E2 | Senior Escrow Officer | < 1 year | RENT | 107000.0 | Source Verified | Apr-2017 | Charged Off | other | Other | 801xx | 11.65 | Apr-1995 | 13.0 | 2.0 | 9688.0 | 24.9% | 24.0 | f | Individual | 0.0 | 1.0 |
| 2925489 | 102653304 | 10000.0 | 36 months | 7.99% | 313.32 | A | A5 | Rn | 10+ years | MORTGAGE | 65000.0 | Source Verified | Apr-2017 | Fully Paid | debt_consolidation | Debt consolidation | 187xx | 19.55 | Sep-1993 | 15.0 | 0.0 | 9751.0 | 15.7% | 27.0 | w | Individual | 0.0 | 0.0 |
| 2925490 | 102628603 | 10050.0 | 36 months | 16.99% | 358.26 | D | D1 | Sales Associate | 8 years | RENT | 37000.0 | Not Verified | Apr-2017 | Charged Off | debt_consolidation | Debt consolidation | 220xx | 20.56 | May-1993 | 15.0 | 0.0 | 14300.0 | 47% | 21.0 | w | Individual | 0.0 | 0.0 |
| 2925491 | 102196576 | 6000.0 | 36 months | 11.44% | 197.69 | B | B4 | Contact input | 5 years | RENT | 41000.0 | Source Verified | Apr-2017 | Fully Paid | credit_card | Credit card refinancing | 105xx | 19.99 | May-1990 | 9.0 | 0.0 | 1356.0 | 10.1% | 18.0 | w | Individual | 0.0 | 0.0 |
| 2925492 | 99799684 | 30000.0 | 60 months | 25.49% | 889.18 | E | E4 | Assistant Manager | 4 years | MORTGAGE | 105700.0 | Verified | Apr-2017 | Charged Off | debt_consolidation | Debt consolidation | 797xx | 27.26 | Nov-1997 | 12.0 | 0.0 | 15252.0 | 72.6% | 23.0 | w | Individual | 3.0 | 0.0 |
2925493 rows × 28 columns
Note: Not ideal for datasets with too many columns and rows
# profile = ProfileReport(train_df)
# profile.to_notebook_iframe()
train_df.info(verbose=True, show_counts=True)
# verbose: Whether to print the full summary
# show_counts: Whether to show the non-null counts. By default, this is shown only if the DataFrame is small
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2925493 entries, 0 to 2925492 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2925493 non-null object 1 loan_amnt 2925492 non-null float64 2 term 2925492 non-null object 3 int_rate 2925492 non-null object 4 installment 2925492 non-null float64 5 grade 2925492 non-null object 6 sub_grade 2925492 non-null object 7 emp_title 2661406 non-null object 8 emp_length 2720272 non-null object 9 home_ownership 2925492 non-null object 10 annual_inc 2925488 non-null float64 11 verification_status 2925492 non-null object 12 issue_d 2925492 non-null object 13 loan_status 2925492 non-null object 14 purpose 2925492 non-null object 15 title 2902167 non-null object 16 zip_code 2925491 non-null object 17 dti 2922384 non-null float64 18 earliest_cr_line 2925463 non-null object 19 open_acc 2925463 non-null float64 20 pub_rec 2925463 non-null float64 21 revol_bal 2925492 non-null float64 22 revol_util 2922832 non-null object 23 total_acc 2925463 non-null float64 24 initial_list_status 2925492 non-null object 25 application_type 2925492 non-null object 26 mort_acc 2875462 non-null float64 27 pub_rec_bankruptcies 2924127 non-null float64 dtypes: float64(10), object(18) memory usage: 625.0+ MB
Data Type
</br>int_rate, issue_d, earliest_cr_line & revol_util have incorrect data types
</br> Missing Values
</br>Almost all columns except id have missing values.
</br> It is not easy to find out the number or percentage of missing values in each column, hence using for loop for better illustration
full = []
for col in train_df.columns.tolist():
if train_df[col].isnull().sum() != 0:
total = train_df[col].isnull().sum()
percent = round(train_df[col].isnull().sum()/len(train_df)*100, 2)
ls = [col, total, percent]
full.append(ls)
else:
pass
missing = pd.DataFrame(full, columns = ['Variable', 'No. of Missing Values', 'Percent'])
missing
| Variable | No. of Missing Values | Percent | |
|---|---|---|---|
| 0 | loan_amnt | 1 | 0.00 |
| 1 | term | 1 | 0.00 |
| 2 | int_rate | 1 | 0.00 |
| 3 | installment | 1 | 0.00 |
| 4 | grade | 1 | 0.00 |
| 5 | sub_grade | 1 | 0.00 |
| 6 | emp_title | 264087 | 9.03 |
| 7 | emp_length | 205221 | 7.01 |
| 8 | home_ownership | 1 | 0.00 |
| 9 | annual_inc | 5 | 0.00 |
| 10 | verification_status | 1 | 0.00 |
| 11 | issue_d | 1 | 0.00 |
| 12 | loan_status | 1 | 0.00 |
| 13 | purpose | 1 | 0.00 |
| 14 | title | 23326 | 0.80 |
| 15 | zip_code | 2 | 0.00 |
| 16 | dti | 3109 | 0.11 |
| 17 | earliest_cr_line | 30 | 0.00 |
| 18 | open_acc | 30 | 0.00 |
| 19 | pub_rec | 30 | 0.00 |
| 20 | revol_bal | 1 | 0.00 |
| 21 | revol_util | 2661 | 0.09 |
| 22 | total_acc | 30 | 0.00 |
| 23 | initial_list_status | 1 | 0.00 |
| 24 | application_type | 1 | 0.00 |
| 25 | mort_acc | 50031 | 1.71 |
| 26 | pub_rec_bankruptcies | 1366 | 0.05 |
emp_title & emp_length have the highest number of missing values. emp_title may be difficult to impute due to its high cardinality by nature# Categorical Variables
train_df.describe(include = 'object').T
| count | unique | top | freq | |
|---|---|---|---|---|
| id | 2925493 | 2925493 | 1077501 | 1 |
| term | 2925492 | 2 | 36 months | 2060077 |
| int_rate | 2925492 | 704 | 8.19% | 68199 |
| grade | 2925492 | 7 | B | 857573 |
| sub_grade | 2925492 | 35 | B4 | 185188 |
| emp_title | 2661406 | 590413 | Teacher | 50103 |
| emp_length | 2720272 | 11 | 10+ years | 946268 |
| home_ownership | 2925492 | 6 | MORTGAGE | 1437859 |
| verification_status | 2925492 | 3 | Source Verified | 1143247 |
| issue_d | 2925492 | 160 | Mar-2016 | 61992 |
| loan_status | 2925492 | 10 | Fully Paid | 1497783 |
| purpose | 2925492 | 14 | debt_consolidation | 1638058 |
| title | 2902167 | 63155 | Debt consolidation | 1513474 |
| zip_code | 2925491 | 962 | 112xx | 30473 |
| earliest_cr_line | 2925463 | 777 | Sep-2005 | 20089 |
| revol_util | 2922832 | 1443 | 0% | 18856 |
| initial_list_status | 2925492 | 2 | w | 2139434 |
| application_type | 2925492 | 2 | Individual | 2714979 |
# Numeric Variables
train_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| loan_amnt | 2925492.0 | 15358.775105 | 9478.383451 | 500.00 | 8000.00 | 13000.0 | 20000.00 | 4.000000e+04 |
| installment | 2925492.0 | 452.391358 | 272.947849 | 4.93 | 253.50 | 382.3 | 603.92 | 1.719830e+03 |
| annual_inc | 2925488.0 | 79937.274015 | 111747.608711 | 0.00 | 47000.00 | 66000.0 | 95000.00 | 1.100000e+08 |
| dti | 2922384.0 | 19.296480 | 15.719773 | -1.00 | 12.08 | 18.1 | 24.88 | 9.990000e+02 |
| open_acc | 2925463.0 | 11.676163 | 5.733312 | 0.00 | 8.00 | 11.0 | 15.00 | 1.040000e+02 |
| pub_rec | 2925463.0 | 0.176285 | 0.524376 | 0.00 | 0.00 | 0.0 | 0.00 | 8.600000e+01 |
| revol_bal | 2925492.0 | 16964.840934 | 22996.213652 | 0.00 | 5993.00 | 11493.0 | 20644.00 | 2.904836e+06 |
| total_acc | 2925463.0 | 24.018350 | 12.076252 | 1.00 | 15.00 | 22.0 | 31.00 | 1.760000e+02 |
| mort_acc | 2875462.0 | 1.512336 | 1.869776 | 0.00 | 0.00 | 1.0 | 2.00 | 9.400000e+01 |
| pub_rec_bankruptcies | 2924127.0 | 0.122679 | 0.353141 | 0.00 | 0.00 | 0.0 | 0.00 | 1.200000e+01 |
if len(train_df[train_df.duplicated('id', keep = False) == True]) == 0:
print('No Duplicates')
else:
raise Exception('Duplicated ID records')
No Duplicates
train_df[train_df['int_rate'].isnull() == True]
| id | loan_amnt | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | purpose | title | zip_code | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | application_type | mort_acc | pub_rec_bankruptcies | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 39786 | Loans that do not meet the credit policy | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Convert int_rate from str to float
train_df2 = train_df.copy()
train_df2['int_rate'] = train_df2['int_rate'].str[:-1]
train_df2['int_rate'] = train_df2['int_rate'].astype(float)
train_df2
| id | loan_amnt | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | purpose | title | zip_code | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | application_type | mort_acc | pub_rec_bankruptcies | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1077501 | 5000.0 | 36 months | 10.65 | 162.87 | B | B2 | NaN | 10+ years | RENT | 24000.0 | Verified | Dec-2011 | Fully Paid | credit_card | Computer | 860xx | 27.65 | Jan-1985 | 3.0 | 0.0 | 13648.0 | 83.7% | 9.0 | f | Individual | NaN | 0.0 |
| 1 | 1077430 | 2500.0 | 60 months | 15.27 | 59.83 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | Source Verified | Dec-2011 | Charged Off | car | bike | 309xx | 1.00 | Apr-1999 | 3.0 | 0.0 | 1687.0 | 9.4% | 4.0 | f | Individual | NaN | 0.0 |
| 2 | 1077175 | 2400.0 | 36 months | 15.96 | 84.33 | C | C5 | NaN | 10+ years | RENT | 12252.0 | Not Verified | Dec-2011 | Fully Paid | small_business | real estate business | 606xx | 8.72 | Nov-2001 | 2.0 | 0.0 | 2956.0 | 98.5% | 10.0 | f | Individual | NaN | 0.0 |
| 3 | 1076863 | 10000.0 | 36 months | 13.49 | 339.31 | C | C1 | AIR RESOURCES BOARD | 10+ years | RENT | 49200.0 | Source Verified | Dec-2011 | Fully Paid | other | personel | 917xx | 20.00 | Feb-1996 | 10.0 | 0.0 | 5598.0 | 21% | 37.0 | f | Individual | NaN | 0.0 |
| 4 | 1075358 | 3000.0 | 60 months | 12.69 | 67.79 | B | B5 | University Medical Group | 1 year | RENT | 80000.0 | Source Verified | Dec-2011 | Fully Paid | other | Personal | 972xx | 17.94 | Jan-1996 | 15.0 | 0.0 | 27783.0 | 53.9% | 38.0 | f | Individual | NaN | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2925488 | 102556443 | 24000.0 | 60 months | 23.99 | 690.30 | E | E2 | Senior Escrow Officer | < 1 year | RENT | 107000.0 | Source Verified | Apr-2017 | Charged Off | other | Other | 801xx | 11.65 | Apr-1995 | 13.0 | 2.0 | 9688.0 | 24.9% | 24.0 | f | Individual | 0.0 | 1.0 |
| 2925489 | 102653304 | 10000.0 | 36 months | 7.99 | 313.32 | A | A5 | Rn | 10+ years | MORTGAGE | 65000.0 | Source Verified | Apr-2017 | Fully Paid | debt_consolidation | Debt consolidation | 187xx | 19.55 | Sep-1993 | 15.0 | 0.0 | 9751.0 | 15.7% | 27.0 | w | Individual | 0.0 | 0.0 |
| 2925490 | 102628603 | 10050.0 | 36 months | 16.99 | 358.26 | D | D1 | Sales Associate | 8 years | RENT | 37000.0 | Not Verified | Apr-2017 | Charged Off | debt_consolidation | Debt consolidation | 220xx | 20.56 | May-1993 | 15.0 | 0.0 | 14300.0 | 47% | 21.0 | w | Individual | 0.0 | 0.0 |
| 2925491 | 102196576 | 6000.0 | 36 months | 11.44 | 197.69 | B | B4 | Contact input | 5 years | RENT | 41000.0 | Source Verified | Apr-2017 | Fully Paid | credit_card | Credit card refinancing | 105xx | 19.99 | May-1990 | 9.0 | 0.0 | 1356.0 | 10.1% | 18.0 | w | Individual | 0.0 | 0.0 |
| 2925492 | 99799684 | 30000.0 | 60 months | 25.49 | 889.18 | E | E4 | Assistant Manager | 4 years | MORTGAGE | 105700.0 | Verified | Apr-2017 | Charged Off | debt_consolidation | Debt consolidation | 797xx | 27.26 | Nov-1997 | 12.0 | 0.0 | 15252.0 | 72.6% | 23.0 | w | Individual | 3.0 | 0.0 |
2925493 rows × 28 columns
# Convert revol_util from str to float
train_df2['revol_util'] = train_df['revol_util'].str[:-1]
train_df2['revol_util'] = train_df2['revol_util'].astype(float)
train_df2['revol_util'].dtype
dtype('float64')
# Convert earliest_cr_line from str to datetime
train_df2['earliest_cr_line'] = pd.to_datetime(train_df['earliest_cr_line'], format = '%b-%Y')
train_df2['earliest_cr_line']
0 1985-01-01
1 1999-04-01
2 2001-11-01
3 1996-02-01
4 1996-01-01
...
2925488 1995-04-01
2925489 1993-09-01
2925490 1993-05-01
2925491 1990-05-01
2925492 1997-11-01
Name: earliest_cr_line, Length: 2925493, dtype: datetime64[ns]
# Convert issue_d from str to datetime
train_df2['issue_d'] = pd.to_datetime(train_df['issue_d'], format = '%b-%Y').dt.date
train_df2['issue_d']
0 2011-12-01
1 2011-12-01
2 2011-12-01
3 2011-12-01
4 2011-12-01
...
2925488 2017-04-01
2925489 2017-04-01
2925490 2017-04-01
2925491 2017-04-01
2925492 2017-04-01
Name: issue_d, Length: 2925493, dtype: object
numeric = train_df2.select_dtypes(exclude = 'object')
numeric
| loan_amnt | int_rate | installment | annual_inc | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | mort_acc | pub_rec_bankruptcies | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5000.0 | 10.65 | 162.87 | 24000.0 | 27.65 | 1985-01-01 | 3.0 | 0.0 | 13648.0 | 83.7 | 9.0 | NaN | 0.0 |
| 1 | 2500.0 | 15.27 | 59.83 | 30000.0 | 1.00 | 1999-04-01 | 3.0 | 0.0 | 1687.0 | 9.4 | 4.0 | NaN | 0.0 |
| 2 | 2400.0 | 15.96 | 84.33 | 12252.0 | 8.72 | 2001-11-01 | 2.0 | 0.0 | 2956.0 | 98.5 | 10.0 | NaN | 0.0 |
| 3 | 10000.0 | 13.49 | 339.31 | 49200.0 | 20.00 | 1996-02-01 | 10.0 | 0.0 | 5598.0 | 21.0 | 37.0 | NaN | 0.0 |
| 4 | 3000.0 | 12.69 | 67.79 | 80000.0 | 17.94 | 1996-01-01 | 15.0 | 0.0 | 27783.0 | 53.9 | 38.0 | NaN | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2925488 | 24000.0 | 23.99 | 690.30 | 107000.0 | 11.65 | 1995-04-01 | 13.0 | 2.0 | 9688.0 | 24.9 | 24.0 | 0.0 | 1.0 |
| 2925489 | 10000.0 | 7.99 | 313.32 | 65000.0 | 19.55 | 1993-09-01 | 15.0 | 0.0 | 9751.0 | 15.7 | 27.0 | 0.0 | 0.0 |
| 2925490 | 10050.0 | 16.99 | 358.26 | 37000.0 | 20.56 | 1993-05-01 | 15.0 | 0.0 | 14300.0 | 47.0 | 21.0 | 0.0 | 0.0 |
| 2925491 | 6000.0 | 11.44 | 197.69 | 41000.0 | 19.99 | 1990-05-01 | 9.0 | 0.0 | 1356.0 | 10.1 | 18.0 | 0.0 | 0.0 |
| 2925492 | 30000.0 | 25.49 | 889.18 | 105700.0 | 27.26 | 1997-11-01 | 12.0 | 0.0 | 15252.0 | 72.6 | 23.0 | 3.0 | 0.0 |
2925493 rows × 13 columns
full = []
for col in numeric.columns.tolist():
if (numeric[col]==0).sum() != 0:
total = (numeric[col]==0).sum()
percent = round((numeric[col]==0).sum()/len(numeric)*100, 2)
ls = [col, total, percent]
full.append(ls)
else:
pass
numeric_df = pd.DataFrame(full, columns = ['Variable', 'No. of 0s', 'Percent'])
numeric_df
| Variable | No. of 0s | Percent | |
|---|---|---|---|
| 0 | annual_inc | 3054 | 0.10 |
| 1 | dti | 2353 | 0.08 |
| 2 | open_acc | 92 | 0.00 |
| 3 | pub_rec | 2499290 | 85.43 |
| 4 | revol_bal | 18099 | 0.62 |
| 5 | revol_util | 18856 | 0.64 |
| 6 | mort_acc | 1233212 | 42.15 |
| 7 | pub_rec_bankruptcies | 2583966 | 88.33 |
annual_inc: Possible to be 0 as there could be borrower who has no income but just want to borrow money. However do ensure that their dti is either NaN or 0dti: Can be 0 as there may be borrower with no debt# Check to ensure all records with annual income as 0 have their dti as 0 or NaN
if len(numeric[(numeric['annual_inc'] == 0) & (~numeric['dti'].isin([np.nan, 0]))]) == 0:
pass
else:
raise Exception('Pls rectify')
# Remove any columns with more than 70% missing values
train_df2a = train_df2[train_df2.columns[train_df2.isnull().mean() < 0.7]]
print('Num of columns removed:', len(train_df2) - len(train_df2a))
Num of columns removed: 0
# Remove any rows with more than 70% missing values
train_df2b = train_df2[train_df2.isnull().mean(axis = 1) < 0.7]
print('Num of rows removed:', len(train_df2a) - len(train_df2b))
Num of rows removed: 1
full = []
for col in train_df2b.columns.tolist():
if train_df2b[col].isnull().sum() != 0:
total = train_df2b[col].isnull().sum()
percent = round(train_df2b[col].isnull().sum()/len(train_df2b)*100, 2)
ls = [col, total, percent]
full.append(ls)
else:
pass
missing = pd.DataFrame(full, columns = ['Variable', 'No. of Missing Values', 'Percent'])
missing
| Variable | No. of Missing Values | Percent | |
|---|---|---|---|
| 0 | emp_title | 264086 | 9.03 |
| 1 | emp_length | 205220 | 7.01 |
| 2 | annual_inc | 4 | 0.00 |
| 3 | title | 23325 | 0.80 |
| 4 | zip_code | 1 | 0.00 |
| 5 | dti | 3108 | 0.11 |
| 6 | earliest_cr_line | 29 | 0.00 |
| 7 | open_acc | 29 | 0.00 |
| 8 | pub_rec | 29 | 0.00 |
| 9 | revol_util | 2660 | 0.09 |
| 10 | total_acc | 29 | 0.00 |
| 11 | mort_acc | 50030 | 1.71 |
| 12 | pub_rec_bankruptcies | 1365 | 0.05 |
missing[missing['Variable'].isin(train_df2b.select_dtypes(exclude = 'object'))]
| Variable | No. of Missing Values | Percent | |
|---|---|---|---|
| 2 | annual_inc | 4 | 0.00 |
| 5 | dti | 3108 | 0.11 |
| 6 | earliest_cr_line | 29 | 0.00 |
| 7 | open_acc | 29 | 0.00 |
| 8 | pub_rec | 29 | 0.00 |
| 9 | revol_util | 2660 | 0.09 |
| 10 | total_acc | 29 | 0.00 |
| 11 | mort_acc | 50030 | 1.71 |
| 12 | pub_rec_bankruptcies | 1365 | 0.05 |
Imputation of columns with missing values </u>
revol_util, mort_acc, annual_inc, pub_rec_bankruptcies are numeric variables, but are right-skewed. Hence to impute using median
dti can be imputed using mean() as it is not skewed
train_df3 = train_df2b.copy()
train_df3['annual_inc'] = train_df3['annual_inc'].fillna(train_df3['annual_inc'].median())
train_df3['revol_util'] = train_df3['revol_util'].fillna(train_df3['revol_util'].median())
train_df3['mort_acc'] = train_df3['mort_acc'].fillna(train_df3['mort_acc'].median())
train_df3['pub_rec_bankruptcies'] = train_df3['pub_rec_bankruptcies'].fillna(train_df3['pub_rec_bankruptcies'].median())
train_df3['dti'] = train_df3['dti'].fillna(train_df3['dti'].mean())
Notice that 13 records have missing account-related variables. Hence decided to remove these 13 records
train_df3[['open_acc', 'earliest_cr_line', 'open_acc', 'pub_rec', 'total_acc']][train_df3['open_acc'].isna()]
| open_acc | earliest_cr_line | open_acc | pub_rec | total_acc | |
|---|---|---|---|---|---|
| 42450 | NaN | NaT | NaN | NaN | NaN |
| 42451 | NaN | NaT | NaN | NaN | NaN |
| 42460 | NaN | NaT | NaN | NaN | NaN |
| 42473 | NaN | NaT | NaN | NaN | NaN |
| 42481 | NaN | NaT | NaN | NaN | NaN |
| 42484 | NaN | NaT | NaN | NaN | NaN |
| 42495 | NaN | NaT | NaN | NaN | NaN |
| 42510 | NaN | NaT | NaN | NaN | NaN |
| 42515 | NaN | NaT | NaN | NaN | NaN |
| 42516 | NaN | NaT | NaN | NaN | NaN |
| 42517 | NaN | NaT | NaN | NaN | NaN |
| 42518 | NaN | NaT | NaN | NaN | NaN |
| 42519 | NaN | NaT | NaN | NaN | NaN |
| 42520 | NaN | NaT | NaN | NaN | NaN |
| 42521 | NaN | NaT | NaN | NaN | NaN |
| 42522 | NaN | NaT | NaN | NaN | NaN |
| 42523 | NaN | NaT | NaN | NaN | NaN |
| 42524 | NaN | NaT | NaN | NaN | NaN |
| 42525 | NaN | NaT | NaN | NaN | NaN |
| 42526 | NaN | NaT | NaN | NaN | NaN |
| 42527 | NaN | NaT | NaN | NaN | NaN |
| 42528 | NaN | NaT | NaN | NaN | NaN |
| 42529 | NaN | NaT | NaN | NaN | NaN |
| 42530 | NaN | NaT | NaN | NaN | NaN |
| 42531 | NaN | NaT | NaN | NaN | NaN |
| 42532 | NaN | NaT | NaN | NaN | NaN |
| 42533 | NaN | NaT | NaN | NaN | NaN |
| 42534 | NaN | NaT | NaN | NaN | NaN |
| 42535 | NaN | NaT | NaN | NaN | NaN |
train_df3 = train_df3[~train_df3['open_acc'].isna()]
train_df3.select_dtypes(exclude = 'object').isna().sum()
loan_amnt 0 int_rate 0 installment 0 annual_inc 0 dti 0 earliest_cr_line 0 open_acc 0 pub_rec 0 revol_bal 0 revol_util 0 total_acc 0 mort_acc 0 pub_rec_bankruptcies 0 dtype: int64
missing[missing['Variable'].isin(train_df3.select_dtypes(include = 'object'))]
| Variable | No. of Missing Values | Percent | |
|---|---|---|---|
| 0 | emp_title | 264086 | 9.03 |
| 1 | emp_length | 205220 | 7.01 |
| 3 | title | 23325 | 0.80 |
| 4 | zip_code | 1 | 0.00 |
train_df3['emp_length'] = train_df3['emp_length'].fillna(train_df3['emp_length'].mode())
train_df4 = train_df3.drop(['issue_d', 'id'], axis = 1)
train_df4.sample(3)
| loan_amnt | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | loan_status | purpose | title | zip_code | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | application_type | mort_acc | pub_rec_bankruptcies | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2639237 | 26000.0 | 36 months | 19.99 | 966.13 | D | D4 | Business development director | 5 years | MORTGAGE | 92000.0 | Verified | Fully Paid | debt_consolidation | Debt consolidation | 711xx | 15.29 | 2003-11-01 | 6.0 | 0.0 | 24199.0 | 83.4 | 15.0 | f | Individual | 2.0 | 0.0 |
| 1080527 | 10000.0 | 36 months | 16.14 | 352.27 | C | C4 | Valet Supervisor | 2 years | RENT | 38000.0 | Not Verified | Current | debt_consolidation | Debt consolidation | 980xx | 10.20 | 2010-02-01 | 4.0 | 0.0 | 11933.0 | 75.1 | 7.0 | f | Individual | 0.0 | 0.0 |
| 1203866 | 18000.0 | 36 months | 22.50 | 692.10 | D | D3 | Occupational Therapist | 10+ years | OWN | 60000.0 | Verified | Current | credit_card | Credit card refinancing | 620xx | 37.84 | 2001-05-01 | 8.0 | 0.0 | 28691.0 | 63.6 | 14.0 | w | Individual | 1.0 | 0.0 |
issue_d may not be useful for prediction as this is done after the issuance of loan
</br> id are removed as they are not predictor variable but identifiers
loan_status_dict = train_df4['loan_status'].value_counts().to_dict()
plt.barh(list(loan_status_dict.keys()), list(loan_status_dict.values()))
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
for i, v in enumerate(train_df4['loan_status'].value_counts()):
plt.text(v, i, str(v), fontsize = 12, ha='left')
plt.show()
Relabelling the Loan Status
From the above visualisation, we observed that 10 type of loan status exist in this data set. We are only interested in 2 status i.e. Defaulted and Not Defaulted. Hence, we will need to add a new variable which will be binary (0s and 1s).
All those loans, whose status is “Fully Paid”, “Current” will be categorized as Not Defaulted and anything else will be categorized as Defaulted. To achieve this we will introduce new variable defaulted.
# First we define the function
def change_loan_status(loan_status):
if loan_status in ['Fully Paid', 'Current']:
return 0
else:
return 1
# Next we apply the function
train_df4['loan_status'] = train_df4['loan_status'].apply(change_loan_status)
train_df4.head()
| loan_amnt | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | loan_status | purpose | title | zip_code | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | application_type | mort_acc | pub_rec_bankruptcies | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5000.0 | 36 months | 10.65 | 162.87 | B | B2 | NaN | 10+ years | RENT | 24000.0 | Verified | 0 | credit_card | Computer | 860xx | 27.65 | 1985-01-01 | 3.0 | 0.0 | 13648.0 | 83.7 | 9.0 | f | Individual | 1.0 | 0.0 |
| 1 | 2500.0 | 60 months | 15.27 | 59.83 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | Source Verified | 1 | car | bike | 309xx | 1.00 | 1999-04-01 | 3.0 | 0.0 | 1687.0 | 9.4 | 4.0 | f | Individual | 1.0 | 0.0 |
| 2 | 2400.0 | 36 months | 15.96 | 84.33 | C | C5 | NaN | 10+ years | RENT | 12252.0 | Not Verified | 0 | small_business | real estate business | 606xx | 8.72 | 2001-11-01 | 2.0 | 0.0 | 2956.0 | 98.5 | 10.0 | f | Individual | 1.0 | 0.0 |
| 3 | 10000.0 | 36 months | 13.49 | 339.31 | C | C1 | AIR RESOURCES BOARD | 10+ years | RENT | 49200.0 | Source Verified | 0 | other | personel | 917xx | 20.00 | 1996-02-01 | 10.0 | 0.0 | 5598.0 | 21.0 | 37.0 | f | Individual | 1.0 | 0.0 |
| 4 | 3000.0 | 60 months | 12.69 | 67.79 | B | B5 | University Medical Group | 1 year | RENT | 80000.0 | Source Verified | 0 | other | Personal | 972xx | 17.94 | 1996-01-01 | 15.0 | 0.0 | 27783.0 | 53.9 | 38.0 | f | Individual | 1.0 | 0.0 |
Once again, we apply the value_counts() to see the number of instances of each unique status in the loan_status data column. Now we can see that the loan status only has 2 category.
loanstatus = train_df4['loan_status'].value_counts()
sns.barplot(loanstatus.index, loanstatus.values)
<AxesSubplot:>
train_df4.select_dtypes('object').columns
Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
'home_ownership', 'verification_status', 'purpose', 'title', 'zip_code',
'initial_list_status', 'application_type'],
dtype='object')
for i in train_df4.term.unique():
print(i)
print(f"{train_df4[train_df4.term == i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
36 months 0 0.884 1 0.116 Name: loan_status, dtype: float64 ========================================== 60 months 0 0.818 1 0.182 Name: loan_status, dtype: float64 ==========================================
for i in train_df4.emp_length.unique():
print(i)
print(f"{train_df4[train_df4.emp_length == i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
10+ years 0 0.873 1 0.127 Name: loan_status, dtype: float64 ========================================== < 1 year 0 0.869 1 0.131 Name: loan_status, dtype: float64 ========================================== 1 year 0 0.86 1 0.14 Name: loan_status, dtype: float64 ========================================== 3 years 0 0.863 1 0.137 Name: loan_status, dtype: float64 ========================================== 8 years 0 0.856 1 0.144 Name: loan_status, dtype: float64 ========================================== 9 years 0 0.857 1 0.143 Name: loan_status, dtype: float64 ========================================== 4 years 0 0.864 1 0.136 Name: loan_status, dtype: float64 ========================================== 5 years 0 0.866 1 0.134 Name: loan_status, dtype: float64 ========================================== 6 years 0 0.866 1 0.134 Name: loan_status, dtype: float64 ========================================== 2 years 0 0.865 1 0.135 Name: loan_status, dtype: float64 ========================================== 7 years 0 0.862 1 0.138 Name: loan_status, dtype: float64 ========================================== nan Series([], Name: loan_status, dtype: float64) ==========================================
for i in train_df4.sort_values('sub_grade')['sub_grade'].unique():
print(i)
print(f"{train_df4[train_df4.sub_grade == i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
A1 0 0.976 1 0.024 Name: loan_status, dtype: float64 ========================================== A2 0 0.967 1 0.033 Name: loan_status, dtype: float64 ========================================== A3 0 0.96 1 0.04 Name: loan_status, dtype: float64 ========================================== A4 0 0.955 1 0.045 Name: loan_status, dtype: float64 ========================================== A5 0 0.941 1 0.059 Name: loan_status, dtype: float64 ========================================== B1 0 0.926 1 0.074 Name: loan_status, dtype: float64 ========================================== B2 0 0.918 1 0.082 Name: loan_status, dtype: float64 ========================================== B3 0 0.902 1 0.098 Name: loan_status, dtype: float64 ========================================== B4 0 0.897 1 0.103 Name: loan_status, dtype: float64 ========================================== B5 0 0.883 1 0.117 Name: loan_status, dtype: float64 ========================================== C1 0 0.864 1 0.136 Name: loan_status, dtype: float64 ========================================== C2 0 0.85 1 0.15 Name: loan_status, dtype: float64 ========================================== C3 0 0.841 1 0.159 Name: loan_status, dtype: float64 ========================================== C4 0 0.82 1 0.18 Name: loan_status, dtype: float64 ========================================== C5 0 0.818 1 0.182 Name: loan_status, dtype: float64 ========================================== D1 0 0.798 1 0.202 Name: loan_status, dtype: float64 ========================================== D2 0 0.809 1 0.191 Name: loan_status, dtype: float64 ========================================== D3 0 0.783 1 0.217 Name: loan_status, dtype: float64 ========================================== D4 0 0.754 1 0.246 Name: loan_status, dtype: float64 ========================================== D5 0 0.741 1 0.259 Name: loan_status, dtype: float64 ========================================== E1 0 0.695 1 0.305 Name: loan_status, dtype: float64 ========================================== E2 0 0.677 1 0.323 Name: loan_status, dtype: float64 ========================================== E3 0 0.666 1 0.334 Name: loan_status, dtype: float64 ========================================== E4 0 0.653 1 0.347 Name: loan_status, dtype: float64 ========================================== E5 0 0.649 1 0.351 Name: loan_status, dtype: float64 ========================================== F1 0 0.623 1 0.377 Name: loan_status, dtype: float64 ========================================== F2 0 0.592 1 0.408 Name: loan_status, dtype: float64 ========================================== F3 0 0.592 1 0.408 Name: loan_status, dtype: float64 ========================================== F4 0 0.563 1 0.437 Name: loan_status, dtype: float64 ========================================== F5 0 0.553 1 0.447 Name: loan_status, dtype: float64 ========================================== G1 0 0.572 1 0.428 Name: loan_status, dtype: float64 ========================================== G2 0 0.553 1 0.447 Name: loan_status, dtype: float64 ========================================== G3 0 0.535 1 0.465 Name: loan_status, dtype: float64 ========================================== G4 0 0.523 1 0.477 Name: loan_status, dtype: float64 ========================================== G5 0 0.503 1 0.497 Name: loan_status, dtype: float64 ==========================================
Interestingly, loan grade of similar character have seemingly same loan distribution. Hence to drop sub_grade and keep grade
for i in train_df4.home_ownership.unique():
print(i)
print(f"{train_df4[train_df4.home_ownership == i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
RENT 0 0.843 1 0.157 Name: loan_status, dtype: float64 ========================================== OWN 0 0.863 1 0.137 Name: loan_status, dtype: float64 ========================================== MORTGAGE 0 0.882 1 0.118 Name: loan_status, dtype: float64 ========================================== OTHER 0 0.643 1 0.357 Name: loan_status, dtype: float64 ========================================== NONE 0 0.843 1 0.157 Name: loan_status, dtype: float64 ========================================== ANY 0 0.91 1 0.09 Name: loan_status, dtype: float64 ==========================================
for i in train_df4.verification_status.unique():
print(i)
print(f"{train_df4[train_df4.verification_status == i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
Verified 0 0.814 1 0.186 Name: loan_status, dtype: float64 ========================================== Source Verified 0 0.86 1 0.14 Name: loan_status, dtype: float64 ========================================== Not Verified 0 0.903 1 0.097 Name: loan_status, dtype: float64 ==========================================
for i in train_df4.purpose.unique():
print(i)
print(f"{train_df4[train_df4.purpose== i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
credit_card 0 0.889 1 0.111 Name: loan_status, dtype: float64 ========================================== car 0 0.895 1 0.105 Name: loan_status, dtype: float64 ========================================== small_business 0 0.786 1 0.214 Name: loan_status, dtype: float64 ========================================== other 0 0.859 1 0.141 Name: loan_status, dtype: float64 ========================================== wedding 0 0.857 1 0.143 Name: loan_status, dtype: float64 ========================================== debt_consolidation 0 0.854 1 0.146 Name: loan_status, dtype: float64 ========================================== home_improvement 0 0.878 1 0.122 Name: loan_status, dtype: float64 ========================================== major_purchase 0 0.864 1 0.136 Name: loan_status, dtype: float64 ========================================== medical 0 0.857 1 0.143 Name: loan_status, dtype: float64 ========================================== moving 0 0.835 1 0.165 Name: loan_status, dtype: float64 ========================================== vacation 0 0.873 1 0.127 Name: loan_status, dtype: float64 ========================================== house 0 0.857 1 0.143 Name: loan_status, dtype: float64 ========================================== renewable_energy 0 0.833 1 0.167 Name: loan_status, dtype: float64 ========================================== educational 0 0.639 1 0.361 Name: loan_status, dtype: float64 ==========================================
for i in train_df4.initial_list_status.unique():
print(i)
print(f"{train_df4[train_df4.initial_list_status== i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
f 0 0.83 1 0.17 Name: loan_status, dtype: float64 ========================================== w 0 0.877 1 0.123 Name: loan_status, dtype: float64 ==========================================
for i in train_df4.application_type.unique():
print(i)
print(f"{train_df4[train_df4.application_type== i].loan_status.value_counts(normalize=True).round(3)}")
print('==========================================')
Individual 0 0.862 1 0.138 Name: loan_status, dtype: float64 ========================================== Joint App 0 0.897 1 0.103 Name: loan_status, dtype: float64 ==========================================
train_df4['loan_status'] = train_df4['loan_status'].apply(str)
corr = train_df4.corr().round(3)
fig = px.imshow(corr, color_continuous_scale = 'plasma', text_auto = True, aspect = 'auto')
fig.show()
From the heat map, we can deduce strong positive correlation (>=0.5) between different variables:
There is an absence of strong negative correlation between variables
from itertools import product
import scipy.stats as ss
# Create 2 list of cat variables
col_cat1 = train_df4.select_dtypes(include = 'object').columns
col_cat2 = train_df4.select_dtypes(include = 'object').columns.sort_values()
cat_combi = list(product(col_cat1, col_cat2, repeat = 1))
cat_combi_1 = set(cat_combi) - set([('title', 'emp_title'), ('emp_title', 'title')])
cat_combi_1
{('application_type', 'application_type'),
('application_type', 'emp_length'),
('application_type', 'emp_title'),
('application_type', 'grade'),
('application_type', 'home_ownership'),
('application_type', 'initial_list_status'),
('application_type', 'loan_status'),
('application_type', 'purpose'),
('application_type', 'sub_grade'),
('application_type', 'term'),
('application_type', 'title'),
('application_type', 'verification_status'),
('application_type', 'zip_code'),
('emp_length', 'application_type'),
('emp_length', 'emp_length'),
('emp_length', 'emp_title'),
('emp_length', 'grade'),
('emp_length', 'home_ownership'),
('emp_length', 'initial_list_status'),
('emp_length', 'loan_status'),
('emp_length', 'purpose'),
('emp_length', 'sub_grade'),
('emp_length', 'term'),
('emp_length', 'title'),
('emp_length', 'verification_status'),
('emp_length', 'zip_code'),
('emp_title', 'application_type'),
('emp_title', 'emp_length'),
('emp_title', 'emp_title'),
('emp_title', 'grade'),
('emp_title', 'home_ownership'),
('emp_title', 'initial_list_status'),
('emp_title', 'loan_status'),
('emp_title', 'purpose'),
('emp_title', 'sub_grade'),
('emp_title', 'term'),
('emp_title', 'verification_status'),
('emp_title', 'zip_code'),
('grade', 'application_type'),
('grade', 'emp_length'),
('grade', 'emp_title'),
('grade', 'grade'),
('grade', 'home_ownership'),
('grade', 'initial_list_status'),
('grade', 'loan_status'),
('grade', 'purpose'),
('grade', 'sub_grade'),
('grade', 'term'),
('grade', 'title'),
('grade', 'verification_status'),
('grade', 'zip_code'),
('home_ownership', 'application_type'),
('home_ownership', 'emp_length'),
('home_ownership', 'emp_title'),
('home_ownership', 'grade'),
('home_ownership', 'home_ownership'),
('home_ownership', 'initial_list_status'),
('home_ownership', 'loan_status'),
('home_ownership', 'purpose'),
('home_ownership', 'sub_grade'),
('home_ownership', 'term'),
('home_ownership', 'title'),
('home_ownership', 'verification_status'),
('home_ownership', 'zip_code'),
('initial_list_status', 'application_type'),
('initial_list_status', 'emp_length'),
('initial_list_status', 'emp_title'),
('initial_list_status', 'grade'),
('initial_list_status', 'home_ownership'),
('initial_list_status', 'initial_list_status'),
('initial_list_status', 'loan_status'),
('initial_list_status', 'purpose'),
('initial_list_status', 'sub_grade'),
('initial_list_status', 'term'),
('initial_list_status', 'title'),
('initial_list_status', 'verification_status'),
('initial_list_status', 'zip_code'),
('loan_status', 'application_type'),
('loan_status', 'emp_length'),
('loan_status', 'emp_title'),
('loan_status', 'grade'),
('loan_status', 'home_ownership'),
('loan_status', 'initial_list_status'),
('loan_status', 'loan_status'),
('loan_status', 'purpose'),
('loan_status', 'sub_grade'),
('loan_status', 'term'),
('loan_status', 'title'),
('loan_status', 'verification_status'),
('loan_status', 'zip_code'),
('purpose', 'application_type'),
('purpose', 'emp_length'),
('purpose', 'emp_title'),
('purpose', 'grade'),
('purpose', 'home_ownership'),
('purpose', 'initial_list_status'),
('purpose', 'loan_status'),
('purpose', 'purpose'),
('purpose', 'sub_grade'),
('purpose', 'term'),
('purpose', 'title'),
('purpose', 'verification_status'),
('purpose', 'zip_code'),
('sub_grade', 'application_type'),
('sub_grade', 'emp_length'),
('sub_grade', 'emp_title'),
('sub_grade', 'grade'),
('sub_grade', 'home_ownership'),
('sub_grade', 'initial_list_status'),
('sub_grade', 'loan_status'),
('sub_grade', 'purpose'),
('sub_grade', 'sub_grade'),
('sub_grade', 'term'),
('sub_grade', 'title'),
('sub_grade', 'verification_status'),
('sub_grade', 'zip_code'),
('term', 'application_type'),
('term', 'emp_length'),
('term', 'emp_title'),
('term', 'grade'),
('term', 'home_ownership'),
('term', 'initial_list_status'),
('term', 'loan_status'),
('term', 'purpose'),
('term', 'sub_grade'),
('term', 'term'),
('term', 'title'),
('term', 'verification_status'),
('term', 'zip_code'),
('title', 'application_type'),
('title', 'emp_length'),
('title', 'grade'),
('title', 'home_ownership'),
('title', 'initial_list_status'),
('title', 'loan_status'),
('title', 'purpose'),
('title', 'sub_grade'),
('title', 'term'),
('title', 'title'),
('title', 'verification_status'),
('title', 'zip_code'),
('verification_status', 'application_type'),
('verification_status', 'emp_length'),
('verification_status', 'emp_title'),
('verification_status', 'grade'),
('verification_status', 'home_ownership'),
('verification_status', 'initial_list_status'),
('verification_status', 'loan_status'),
('verification_status', 'purpose'),
('verification_status', 'sub_grade'),
('verification_status', 'term'),
('verification_status', 'title'),
('verification_status', 'verification_status'),
('verification_status', 'zip_code'),
('zip_code', 'application_type'),
('zip_code', 'emp_length'),
('zip_code', 'emp_title'),
('zip_code', 'grade'),
('zip_code', 'home_ownership'),
('zip_code', 'initial_list_status'),
('zip_code', 'loan_status'),
('zip_code', 'purpose'),
('zip_code', 'sub_grade'),
('zip_code', 'term'),
('zip_code', 'title'),
('zip_code', 'verification_status'),
('zip_code', 'zip_code')}
# def cramers_v(x, y):
# confusion_matrix = pd.crosstab(x,y)
# chi2 = ss.chi2_contingency(confusion_matrix)[0]
# n = confusion_matrix.sum().sum()
# phi2 = chi2/n
# r,k = confusion_matrix.shape
# phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
# rcorr = r-((r-1)**2)/(n-1)
# kcorr = k-((k-1)**2)/(n-1)
# return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))
# result = []
# for i in cat_combi_1:
# if i[0] != i[1]:
# print(i[0], i[1])
# result.append((i[0],i[1], cramers_v(train_df4[i[0]], train_df4[i[1]])))
# else:
# pass
# chitest = pd.DataFrame(result, columns = ['var1', 'var2', 'coeff'] )
# chitest_pvt = chitest.pivot(index='var1', columns='var2', values='coeff').round(3)
# fig = px.imshow(chitest_pvt, color_continuous_scale = 'plasma', text_auto = True, aspect = 'auto')
# fig.show()
From the heat map, we can deduce strong positive correlation (>=0.5) between different variables:
There is an absence of strong negative correlation between variables
train_df4_0 = train_df4[train_df4['loan_status']=='0'].sample(frac = 0.25, ignore_index = True)
train_df4_1 = train_df4[train_df4['loan_status']=='1']
train_df5 = pd.concat([train_df4_0, train_df4_1], axis = 0, ignore_index = True)
len(train_df5)
1028864
train_df5['term'] = train_df5['term'].str.strip().str[:2]
train_df5['term'].unique()
array(['36', '60'], dtype=object)
Info about US Postal Code:
To reduce the cardinality of zip code column, so that the com has enough RAM to run a smaller dataset after one-hot encoding, reduce it to 1st two digits
train_df5['zip_code'] = train_df5['zip_code'].str[:2]
train_df5['zip_code'].unique()
array(['02', '78', '63', '95', '12', '11', '35', '33', '72', '80', '44',
'76', '92', '10', '67', '37', '23', '29', '91', '15', '97', '08',
'49', '20', '53', '94', '21', '22', '60', '85', '03', '05', '83',
'30', '93', '55', '57', '24', '32', '99', '90', '34', '77', '79',
'18', '48', '81', '68', '19', '54', '17', '87', '96', '27', '89',
'75', '01', '61', '28', '46', '98', '26', '36', '43', '07', '64',
'41', '59', '38', '66', '45', '70', '06', '62', '40', '73', '14',
'74', '47', '84', '31', '42', '04', '13', '39', '86', '65', '82',
'88', '71', '58', '16', '25', '56', '69', '52', '09', '50', '00',
nan, '51'], dtype=object)
train_df5['Year_cr_line'] = pd.to_datetime(train_df5['earliest_cr_line'], format = '%b-%Y').dt.year.astype(str)
train_df5['Year_cr_line']
0 2000
1 1999
2 2000
3 2007
4 2001
...
1028859 2002
1028860 2005
1028861 1995
1028862 1993
1028863 1997
Name: Year_cr_line, Length: 1028864, dtype: object
issue_d and earliest_cr_line, hence to drop these columns for incomplete covereage
train_df6 = train_df5.sort_values(['grade'])
train_df6['grade'].unique()
labelencoder = LabelEncoder()
train_df6['grade'] = labelencoder.fit_transform(train_df6['grade'])
train_df6
| loan_amnt | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | loan_status | purpose | title | zip_code | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | application_type | mort_acc | pub_rec_bankruptcies | Year_cr_line | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 481659 | 27650.0 | 36 | 8.19 | 868.88 | 0 | A4 | Audit Consultant | 5 years | OWN | 90000.0 | Source Verified | 0 | debt_consolidation | Debt consolidation | 21 | 29.08 | 1997-04-01 | 12.0 | 0.0 | 14096.0 | 31.6 | 24.0 | w | Individual | 1.0 | 0.0 | 1997 |
| 369318 | 9000.0 | 36 | 10.81 | 293.84 | 0 | A5 | Facility Manager | 4 years | RENT | 53000.0 | Not Verified | 0 | debt_consolidation | Debt consolidation | 23 | 26.86 | 2013-01-01 | 29.0 | 0.0 | 18990.0 | 34.7 | 38.0 | w | Individual | 0.0 | 0.0 | 2013 |
| 173152 | 24000.0 | 60 | 8.19 | 488.82 | 0 | A4 | DSR | 10+ years | MORTGAGE | 94000.0 | Source Verified | 0 | debt_consolidation | Debt consolidation | 71 | 23.38 | 1993-12-01 | 11.0 | 0.0 | 21936.0 | 28.5 | 24.0 | w | Joint App | 3.0 | 0.0 | 1993 |
| 173153 | 12500.0 | 36 | 6.49 | 383.06 | 0 | A2 | Marketing Manager | 5 years | MORTGAGE | 69500.0 | Not Verified | 0 | debt_consolidation | Debt consolidation | 33 | 14.50 | 2005-01-01 | 14.0 | 0.0 | 14163.0 | 32.5 | 28.0 | w | Individual | 1.0 | 0.0 | 2005 |
| 902087 | 5600.0 | 36 | 7.89 | 175.20 | 0 | A5 | Legal Assistant | 9 years | MORTGAGE | 55000.0 | Not Verified | 1 | major_purchase | Major purchase | 90 | 13.11 | 1996-06-01 | 5.0 | 0.0 | 4771.0 | 71.2 | 21.0 | w | Individual | 2.0 | 0.0 | 1996 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 418365 | 11525.0 | 60 | 28.99 | 365.76 | 6 | G5 | Document control | < 1 year | RENT | 28800.0 | Source Verified | 0 | debt_consolidation | Debt consolidation | 91 | 40.08 | 2007-09-01 | 14.0 | 0.0 | 14701.0 | 53.7 | 17.0 | f | Individual | 0.0 | 0.0 | 2007 |
| 802582 | 30000.0 | 60 | 25.80 | 894.67 | 6 | G1 | Director of clinic operations | 2 years | MORTGAGE | 120000.0 | Verified | 1 | debt_consolidation | Debt consolidation | 14 | 16.12 | 2001-09-01 | 14.0 | 0.0 | 18020.0 | 49.6 | 46.0 | f | Individual | 3.0 | 0.0 | 2001 |
| 130405 | 35000.0 | 60 | 30.79 | 1149.42 | 6 | G1 | operator | 10+ years | MORTGAGE | 80000.0 | Verified | 0 | debt_consolidation | Debt consolidation | 75 | 17.48 | 2006-09-01 | 9.0 | 0.0 | 950.0 | 25.0 | 22.0 | w | Individual | 5.0 | 0.0 | 2006 |
| 861165 | 21000.0 | 60 | 26.77 | 638.36 | 6 | G1 | DIRECTOR OF OPERATIONS | 2 years | RENT | 105000.0 | Verified | 1 | vacation | Vacation | 91 | 5.03 | 2008-03-01 | 13.0 | 0.0 | 3053.0 | 38.6 | 16.0 | f | Individual | 0.0 | 0.0 | 2008 |
| 409828 | 30000.0 | 60 | 25.80 | 894.67 | 6 | G1 | Data Systems Project Director | 3 years | RENT | 92851.2 | Verified | 0 | debt_consolidation | Debt consolidation | 85 | 12.73 | 2002-10-01 | 6.0 | 0.0 | 21961.0 | 88.2 | 13.0 | w | Individual | 4.0 | 0.0 | 2002 |
1028864 rows × 27 columns
sub_grade is the subset of grade, hence grade to be kept and discard sub_grade which may cause multicollinearityemp_title, title to be removed due to high cardinalityemp_length, initial_list_status to be removed due to similar distribution of loan status across different employment lengthpseudo_df = train_df6.drop([ 'sub_grade', 'emp_title', 'emp_length', 'initial_list_status', 'title', 'earliest_cr_line'], axis =1)
pseudo_df.select_dtypes('object').columns
Index(['term', 'home_ownership', 'verification_status', 'loan_status',
'purpose', 'zip_code', 'application_type', 'Year_cr_line'],
dtype='object')
dummies = pseudo_df.select_dtypes('object').columns.tolist()
dummies.remove('loan_status') # have to do on separate row, cant put concurrently abv. remove mutates the list in-place
dummies
['term', 'home_ownership', 'verification_status', 'purpose', 'zip_code', 'application_type', 'Year_cr_line']
train_df7 = train_df6.copy()
train_df7['verification_status'] = np.where(train_df7['verification_status'] == 'Source Verified', 'Verified', train_df7['verification_status'])
train_df7['home_ownership'].replace(['NONE', 'ANY'], 'OTHER', inplace = True)
train_df7['home_ownership'] = train_df7['home_ownership'].str.title()
train_df7['application_type'] = train_df7['application_type'].str.title()
for i in dummies:
train_df7[i] = train_df7[i].str.title()
print(train_df7[i].unique())
['36' '60'] ['Own' 'Rent' 'Mortgage' 'Other'] ['Verified' 'Not Verified'] ['Debt_Consolidation' 'Major_Purchase' 'Credit_Card' 'Other' 'Home_Improvement' 'Moving' 'Car' 'Vacation' 'Medical' 'House' 'Small_Business' 'Renewable_Energy' 'Wedding' 'Educational'] ['21' '23' '71' '33' '90' '76' '34' '11' '30' '32' '06' '44' '14' '40' '59' '72' '95' '02' '94' '75' '49' '73' '80' '93' '18' '48' '42' '55' '53' '62' '27' '85' '04' '43' '70' '10' '89' '08' '20' '47' '28' '46' '99' '01' '60' '79' '38' '37' '54' '64' '61' '92' '98' '67' '91' '03' '35' '77' '24' '07' '97' '17' '13' '29' '84' '96' '78' '19' '15' '66' '31' '86' '22' '36' '45' '41' '63' '58' '05' '82' '25' '81' '68' '56' '26' '65' '16' '57' '87' '12' '39' '74' '69' '83' '88' '09' '50' '00' '52' '51' nan] ['Individual' 'Joint App'] ['1997' '2013' '1993' '2005' '1996' '2001' '1991' '1998' '1976' '1999' '2007' '2002' '1968' '1984' '1995' '2004' '1994' '2000' '2012' '1987' '2003' '2006' '2008' '1986' '2011' '2014' '1983' '1990' '1992' '1985' '1978' '2010' '1979' '1982' '1989' '1981' '2009' '2015' '1988' '1967' '1977' '1975' '2016' '1971' '1980' '1972' '1962' '1973' '1963' '1970' '1974' '1969' '1965' '1961' '1952' '1966' '1959' '1964' '2017' '1960' '1956' '1958' '1955' '1957' '1944' '1953' '1948' '1950' '1951' '1954' '1946' '1949' '1934']
train_df8 = pd.get_dummies(train_df7, columns = dummies, drop_first = True)
# train_df4.drop(dummies, axis = 1, inplace = True)
train_df8
| loan_amnt | int_rate | installment | grade | sub_grade | emp_title | emp_length | annual_inc | loan_status | title | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | mort_acc | pub_rec_bankruptcies | term_60 | home_ownership_Other | home_ownership_Own | home_ownership_Rent | verification_status_Verified | purpose_Credit_Card | purpose_Debt_Consolidation | purpose_Educational | purpose_Home_Improvement | purpose_House | purpose_Major_Purchase | purpose_Medical | purpose_Moving | purpose_Other | purpose_Renewable_Energy | purpose_Small_Business | purpose_Vacation | purpose_Wedding | zip_code_01 | zip_code_02 | zip_code_03 | zip_code_04 | zip_code_05 | zip_code_06 | zip_code_07 | zip_code_08 | zip_code_09 | zip_code_10 | zip_code_11 | zip_code_12 | zip_code_13 | zip_code_14 | zip_code_15 | zip_code_16 | zip_code_17 | zip_code_18 | zip_code_19 | zip_code_20 | zip_code_21 | zip_code_22 | zip_code_23 | zip_code_24 | zip_code_25 | zip_code_26 | zip_code_27 | zip_code_28 | zip_code_29 | zip_code_30 | zip_code_31 | zip_code_32 | zip_code_33 | zip_code_34 | zip_code_35 | zip_code_36 | zip_code_37 | zip_code_38 | zip_code_39 | zip_code_40 | zip_code_41 | zip_code_42 | zip_code_43 | zip_code_44 | zip_code_45 | zip_code_46 | zip_code_47 | zip_code_48 | zip_code_49 | zip_code_50 | zip_code_51 | zip_code_52 | zip_code_53 | zip_code_54 | zip_code_55 | zip_code_56 | zip_code_57 | zip_code_58 | zip_code_59 | zip_code_60 | zip_code_61 | zip_code_62 | zip_code_63 | zip_code_64 | zip_code_65 | zip_code_66 | zip_code_67 | zip_code_68 | zip_code_69 | zip_code_70 | zip_code_71 | zip_code_72 | zip_code_73 | zip_code_74 | zip_code_75 | zip_code_76 | zip_code_77 | zip_code_78 | zip_code_79 | zip_code_80 | zip_code_81 | zip_code_82 | zip_code_83 | zip_code_84 | zip_code_85 | zip_code_86 | zip_code_87 | zip_code_88 | zip_code_89 | zip_code_90 | zip_code_91 | zip_code_92 | zip_code_93 | zip_code_94 | zip_code_95 | zip_code_96 | zip_code_97 | zip_code_98 | zip_code_99 | application_type_Joint App | Year_cr_line_1944 | Year_cr_line_1946 | Year_cr_line_1948 | Year_cr_line_1949 | Year_cr_line_1950 | Year_cr_line_1951 | Year_cr_line_1952 | Year_cr_line_1953 | Year_cr_line_1954 | Year_cr_line_1955 | Year_cr_line_1956 | Year_cr_line_1957 | Year_cr_line_1958 | Year_cr_line_1959 | Year_cr_line_1960 | Year_cr_line_1961 | Year_cr_line_1962 | Year_cr_line_1963 | Year_cr_line_1964 | Year_cr_line_1965 | Year_cr_line_1966 | Year_cr_line_1967 | Year_cr_line_1968 | Year_cr_line_1969 | Year_cr_line_1970 | Year_cr_line_1971 | Year_cr_line_1972 | Year_cr_line_1973 | Year_cr_line_1974 | Year_cr_line_1975 | Year_cr_line_1976 | Year_cr_line_1977 | Year_cr_line_1978 | Year_cr_line_1979 | Year_cr_line_1980 | Year_cr_line_1981 | Year_cr_line_1982 | Year_cr_line_1983 | Year_cr_line_1984 | Year_cr_line_1985 | Year_cr_line_1986 | Year_cr_line_1987 | Year_cr_line_1988 | Year_cr_line_1989 | Year_cr_line_1990 | Year_cr_line_1991 | Year_cr_line_1992 | Year_cr_line_1993 | Year_cr_line_1994 | Year_cr_line_1995 | Year_cr_line_1996 | Year_cr_line_1997 | Year_cr_line_1998 | Year_cr_line_1999 | Year_cr_line_2000 | Year_cr_line_2001 | Year_cr_line_2002 | Year_cr_line_2003 | Year_cr_line_2004 | Year_cr_line_2005 | Year_cr_line_2006 | Year_cr_line_2007 | Year_cr_line_2008 | Year_cr_line_2009 | Year_cr_line_2010 | Year_cr_line_2011 | Year_cr_line_2012 | Year_cr_line_2013 | Year_cr_line_2014 | Year_cr_line_2015 | Year_cr_line_2016 | Year_cr_line_2017 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 481659 | 27650.0 | 8.19 | 868.88 | 0 | A4 | Audit Consultant | 5 years | 90000.0 | 0 | Debt consolidation | 29.08 | 1997-04-01 | 12.0 | 0.0 | 14096.0 | 31.6 | 24.0 | w | 1.0 | 0.0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 369318 | 9000.0 | 10.81 | 293.84 | 0 | A5 | Facility Manager | 4 years | 53000.0 | 0 | Debt consolidation | 26.86 | 2013-01-01 | 29.0 | 0.0 | 18990.0 | 34.7 | 38.0 | w | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 173152 | 24000.0 | 8.19 | 488.82 | 0 | A4 | DSR | 10+ years | 94000.0 | 0 | Debt consolidation | 23.38 | 1993-12-01 | 11.0 | 0.0 | 21936.0 | 28.5 | 24.0 | w | 3.0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 173153 | 12500.0 | 6.49 | 383.06 | 0 | A2 | Marketing Manager | 5 years | 69500.0 | 0 | Debt consolidation | 14.50 | 2005-01-01 | 14.0 | 0.0 | 14163.0 | 32.5 | 28.0 | w | 1.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 902087 | 5600.0 | 7.89 | 175.20 | 0 | A5 | Legal Assistant | 9 years | 55000.0 | 1 | Major purchase | 13.11 | 1996-06-01 | 5.0 | 0.0 | 4771.0 | 71.2 | 21.0 | w | 2.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 418365 | 11525.0 | 28.99 | 365.76 | 6 | G5 | Document control | < 1 year | 28800.0 | 0 | Debt consolidation | 40.08 | 2007-09-01 | 14.0 | 0.0 | 14701.0 | 53.7 | 17.0 | f | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 802582 | 30000.0 | 25.80 | 894.67 | 6 | G1 | Director of clinic operations | 2 years | 120000.0 | 1 | Debt consolidation | 16.12 | 2001-09-01 | 14.0 | 0.0 | 18020.0 | 49.6 | 46.0 | f | 3.0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 130405 | 35000.0 | 30.79 | 1149.42 | 6 | G1 | operator | 10+ years | 80000.0 | 0 | Debt consolidation | 17.48 | 2006-09-01 | 9.0 | 0.0 | 950.0 | 25.0 | 22.0 | w | 5.0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 861165 | 21000.0 | 26.77 | 638.36 | 6 | G1 | DIRECTOR OF OPERATIONS | 2 years | 105000.0 | 1 | Vacation | 5.03 | 2008-03-01 | 13.0 | 0.0 | 3053.0 | 38.6 | 16.0 | f | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 409828 | 30000.0 | 25.80 | 894.67 | 6 | G1 | Data Systems Project Director | 3 years | 92851.2 | 0 | Debt consolidation | 12.73 | 2002-10-01 | 6.0 | 0.0 | 21961.0 | 88.2 | 13.0 | w | 4.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1028864 rows × 210 columns
train_df8.select_dtypes(['int64','float64']).columns
Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'open_acc',
'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'mort_acc',
'pub_rec_bankruptcies'],
dtype='object')
As mentioned in Section 2a, not logical that borrower could borrow money with annual income = 0. Hence, to impute using median
Additionally, the annual income is extremely right-skewed with a wide range, hence to execute log transform
train_df8[train_df8['annual_inc']==0]
| loan_amnt | int_rate | installment | grade | sub_grade | emp_title | emp_length | annual_inc | loan_status | title | dti | earliest_cr_line | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | mort_acc | pub_rec_bankruptcies | term_60 | home_ownership_Other | home_ownership_Own | home_ownership_Rent | verification_status_Verified | purpose_Credit_Card | purpose_Debt_Consolidation | purpose_Educational | purpose_Home_Improvement | purpose_House | purpose_Major_Purchase | purpose_Medical | purpose_Moving | purpose_Other | purpose_Renewable_Energy | purpose_Small_Business | purpose_Vacation | purpose_Wedding | zip_code_01 | zip_code_02 | zip_code_03 | zip_code_04 | zip_code_05 | zip_code_06 | zip_code_07 | zip_code_08 | zip_code_09 | zip_code_10 | zip_code_11 | zip_code_12 | zip_code_13 | zip_code_14 | zip_code_15 | zip_code_16 | zip_code_17 | zip_code_18 | zip_code_19 | zip_code_20 | zip_code_21 | zip_code_22 | zip_code_23 | zip_code_24 | zip_code_25 | zip_code_26 | zip_code_27 | zip_code_28 | zip_code_29 | zip_code_30 | zip_code_31 | zip_code_32 | zip_code_33 | zip_code_34 | zip_code_35 | zip_code_36 | zip_code_37 | zip_code_38 | zip_code_39 | zip_code_40 | zip_code_41 | zip_code_42 | zip_code_43 | zip_code_44 | zip_code_45 | zip_code_46 | zip_code_47 | zip_code_48 | zip_code_49 | zip_code_50 | zip_code_51 | zip_code_52 | zip_code_53 | zip_code_54 | zip_code_55 | zip_code_56 | zip_code_57 | zip_code_58 | zip_code_59 | zip_code_60 | zip_code_61 | zip_code_62 | zip_code_63 | zip_code_64 | zip_code_65 | zip_code_66 | zip_code_67 | zip_code_68 | zip_code_69 | zip_code_70 | zip_code_71 | zip_code_72 | zip_code_73 | zip_code_74 | zip_code_75 | zip_code_76 | zip_code_77 | zip_code_78 | zip_code_79 | zip_code_80 | zip_code_81 | zip_code_82 | zip_code_83 | zip_code_84 | zip_code_85 | zip_code_86 | zip_code_87 | zip_code_88 | zip_code_89 | zip_code_90 | zip_code_91 | zip_code_92 | zip_code_93 | zip_code_94 | zip_code_95 | zip_code_96 | zip_code_97 | zip_code_98 | zip_code_99 | application_type_Joint App | Year_cr_line_1944 | Year_cr_line_1946 | Year_cr_line_1948 | Year_cr_line_1949 | Year_cr_line_1950 | Year_cr_line_1951 | Year_cr_line_1952 | Year_cr_line_1953 | Year_cr_line_1954 | Year_cr_line_1955 | Year_cr_line_1956 | Year_cr_line_1957 | Year_cr_line_1958 | Year_cr_line_1959 | Year_cr_line_1960 | Year_cr_line_1961 | Year_cr_line_1962 | Year_cr_line_1963 | Year_cr_line_1964 | Year_cr_line_1965 | Year_cr_line_1966 | Year_cr_line_1967 | Year_cr_line_1968 | Year_cr_line_1969 | Year_cr_line_1970 | Year_cr_line_1971 | Year_cr_line_1972 | Year_cr_line_1973 | Year_cr_line_1974 | Year_cr_line_1975 | Year_cr_line_1976 | Year_cr_line_1977 | Year_cr_line_1978 | Year_cr_line_1979 | Year_cr_line_1980 | Year_cr_line_1981 | Year_cr_line_1982 | Year_cr_line_1983 | Year_cr_line_1984 | Year_cr_line_1985 | Year_cr_line_1986 | Year_cr_line_1987 | Year_cr_line_1988 | Year_cr_line_1989 | Year_cr_line_1990 | Year_cr_line_1991 | Year_cr_line_1992 | Year_cr_line_1993 | Year_cr_line_1994 | Year_cr_line_1995 | Year_cr_line_1996 | Year_cr_line_1997 | Year_cr_line_1998 | Year_cr_line_1999 | Year_cr_line_2000 | Year_cr_line_2001 | Year_cr_line_2002 | Year_cr_line_2003 | Year_cr_line_2004 | Year_cr_line_2005 | Year_cr_line_2006 | Year_cr_line_2007 | Year_cr_line_2008 | Year_cr_line_2009 | Year_cr_line_2010 | Year_cr_line_2011 | Year_cr_line_2012 | Year_cr_line_2013 | Year_cr_line_2014 | Year_cr_line_2015 | Year_cr_line_2016 | Year_cr_line_2017 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 507749 | 7000.0 | 6.71 | 215.22 | 0 | A3 | NaN | NaN | 0.0 | 0 | Medical expenses | 19.29648 | 1987-12-01 | 12.0 | 1.0 | 5451.0 | 16.1 | 26.0 | w | 0.0 | 1.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 368291 | 7300.0 | 6.11 | 222.45 | 0 | A1 | NaN | NaN | 0.0 | 0 | Credit card refinancing | 19.29648 | 2007-04-01 | 11.0 | 0.0 | 6608.0 | 13.2 | 11.0 | w | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 173656 | 23000.0 | 6.46 | 704.51 | 0 | A1 | NaN | NaN | 0.0 | 0 | Debt consolidation | 19.29648 | 2014-06-01 | 3.0 | 0.0 | 16274.0 | 51.7 | 4.0 | w | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 623024 | 20000.0 | 7.21 | 619.47 | 0 | A3 | NaN | NaN | 0.0 | 0 | Debt consolidation | 19.29648 | 2004-09-01 | 6.0 | 1.0 | 400.0 | 2.8 | 12.0 | w | 1.0 | 1.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 179870 | 22200.0 | 6.46 | 680.01 | 0 | A1 | NaN | NaN | 0.0 | 0 | Debt consolidation | 19.29648 | 2006-07-01 | 6.0 | 0.0 | 4479.0 | 18.1 | 12.0 | w | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 739220 | 24850.0 | 30.79 | 816.09 | 6 | G1 | NaN | NaN | 0.0 | 1 | Debt consolidation | 19.29648 | 2006-12-01 | 3.0 | 0.0 | 8134.0 | 90.4 | 12.0 | w | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 345029 | 35000.0 | 30.79 | 1149.42 | 6 | G1 | NaN | NaN | 0.0 | 0 | Debt consolidation | 19.29648 | 2007-10-01 | 5.0 | 0.0 | 11878.0 | 55.2 | 8.0 | w | 0.0 | 0.0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 703101 | 10000.0 | 30.79 | 328.41 | 6 | G1 | NaN | NaN | 0.0 | 1 | Business | 19.29648 | 2007-05-01 | 8.0 | 0.0 | 4894.0 | 99.9 | 13.0 | w | 2.0 | 0.0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 226315 | 10000.0 | 30.79 | 328.41 | 6 | G1 | NaN | NaN | 0.0 | 0 | Other | 19.29648 | 2014-11-01 | 6.0 | 0.0 | 2398.0 | 28.5 | 9.0 | w | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 681917 | 30000.0 | 30.84 | 986.14 | 6 | G2 | NaN | NaN | 0.0 | 1 | Debt consolidation | 19.29648 | 2002-10-01 | 4.0 | 0.0 | 4071.0 | 33.9 | 15.0 | w | 1.0 | 0.0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
967 rows × 210 columns
train_df8['annual_inc'].replace(0, train_df8['annual_inc'].median(), inplace = True)
if len(train_df8[train_df8['annual_inc']==0]) == 0:
print("Imputation Successful")
else:
raise Exception("Imputation Unsucessful")
Imputation Successful
train_df8['annual_inc_log'] = (train_df8['annual_inc']).transform(np.log) # no need to +1 as 0 has been imputed
train_df8[['annual_inc_log', 'annual_inc']]
| annual_inc_log | annual_inc | |
|---|---|---|
| 481659 | 11.407565 | 90000.0 |
| 369318 | 10.878047 | 53000.0 |
| 173152 | 11.451050 | 94000.0 |
| 173153 | 11.149082 | 69500.0 |
| 902087 | 10.915088 | 55000.0 |
| ... | ... | ... |
| 418365 | 10.268131 | 28800.0 |
| 802582 | 11.695247 | 120000.0 |
| 130405 | 11.289782 | 80000.0 |
| 861165 | 11.561716 | 105000.0 |
| 409828 | 11.438753 | 92851.2 |
1028864 rows × 2 columns
total_acc is highly correlated with open_acc as shown in the heatmap under Section 2c. Hence, one of the columns has to be removed to minimise error arising from multicollinearity. As total_acc distribution is more than than open_acc, open_acc will be dropped.
train_df8.drop_duplicates(['total_acc', 'open_acc'])[['total_acc', 'open_acc']].sort_values(['total_acc', 'open_acc']).head(10)
| total_acc | open_acc | |
|---|---|---|
| 639629 | 1.0 | 1.0 |
| 343086 | 2.0 | 0.0 |
| 180202 | 2.0 | 1.0 |
| 173328 | 2.0 | 2.0 |
| 94050 | 3.0 | 0.0 |
| 184014 | 3.0 | 1.0 |
| 173611 | 3.0 | 2.0 |
| 173192 | 3.0 | 3.0 |
| 640516 | 3.0 | 4.0 |
| 253214 | 4.0 | 0.0 |
Similar to above, pub_rec_bankruptcies and pub_rec are highly correlated and similar distribution as mentioned in section 2d, pub_rec will be selected as derogatory records can be an early tell-tale signs to determine whether borrower will default
train_df8.drop_duplicates(['pub_rec_bankruptcies', 'pub_rec'])[['pub_rec', 'pub_rec_bankruptcies']].sort_values(['pub_rec', 'pub_rec_bankruptcies']).head(10)
| pub_rec | pub_rec_bankruptcies | |
|---|---|---|
| 481659 | 0.0 | 0.0 |
| 369405 | 1.0 | 0.0 |
| 173111 | 1.0 | 1.0 |
| 369826 | 2.0 | 0.0 |
| 170496 | 2.0 | 1.0 |
| 172988 | 2.0 | 2.0 |
| 371645 | 3.0 | 0.0 |
| 903320 | 3.0 | 1.0 |
| 374435 | 3.0 | 2.0 |
| 176749 | 3.0 | 3.0 |
To drop installment, for the same reason as above
train_df8.drop_duplicates(['loan_amnt', 'installment'])[['loan_amnt', 'installment']].sort_values(['loan_amnt', 'installment']).head(10)
| loan_amnt | installment | |
|---|---|---|
| 639908 | 500.0 | 15.67 |
| 639788 | 500.0 | 15.75 |
| 640501 | 500.0 | 15.76 |
| 640446 | 500.0 | 15.91 |
| 640404 | 500.0 | 16.73 |
| 640220 | 500.0 | 16.85 |
| 640150 | 550.0 | 18.28 |
| 640281 | 600.0 | 19.62 |
| 640367 | 600.0 | 19.80 |
| 639906 | 600.0 | 20.29 |
Similar to annual_inc, does not make sense to have 0 for dti.
As the dti data is skewed with range from 0.01 (ignoring 0) to 9999 (which may be an error), these outliers need to be removed. To only include data in the 1st to 99th percentile based on the report generated in Section 2 where the common values make up 99.3%
(train_df8['dti']==0).sum()
845
(train_df8['dti']==9999).sum()
0
lower_lim =train_df8['dti'].quantile(0.01)
upper_lim =train_df8['dti'].quantile(0.99)
train_df9 = train_df8[(train_df8['dti'] < upper_lim) & (train_df8['dti'] > lower_lim)]
print("max:", np.max(train_df8['dti']), "\nmin:", np.min(train_df8['dti']))
print('Number of Records Removed:', (len(train_df8)-len(train_df9)))
max: 999.0 min: -1.0 Number of Records Removed: 20613
Similar to annual_inc, does not make sense to have 0 for revol_bal.
As the revol_bal data is skewed with range from 1 (ignoring 0) to 1298783, these outliers need to be removed. To only include data in the 1st to 99th percentile based on the report generated in Section 2 where the common values make up 99.5%
lower_lim =train_df9['revol_bal'].quantile(0.01)
upper_lim =train_df9['revol_bal'].quantile(0.99)
train_df10 = train_df8[(train_df9['revol_bal'] < upper_lim) & (train_df8['revol_bal'] > lower_lim)]
print("max:", np.max(train_df10['revol_bal']), "\nmin:", np.min(train_df10['revol_bal']))
print('Number of Records Removed:', (len(train_df9)-len(train_df10)))
max: 95203.0 min: 165.0 Number of Records Removed: 20167
Similar to annual_inc, does not make sense to have 0 for revol_util.
As the revol_bal data is skewed with range from 0.01 (ignoring 0) to 892.3, these outliers need to be removed. To only include data in the 1st to 99th percentile based on the report generated in Section 2 where the common values make up 97.8%
lower_lim =train_df10['revol_util'].quantile(0.01)
upper_lim =train_df10['revol_util'].quantile(0.99)
train_df11 = train_df10[(train_df10['revol_util'] < upper_lim) & (train_df10['revol_util'] > lower_lim)]
print("max:", np.max(train_df11['revol_util']), "\nmin:", np.min(train_df11['revol_util']))
print('Number of Records Removed:', (len(train_df8)-len(train_df9)))
max: 98.1 min: 3.5 Number of Records Removed: 20613
train_df12 = train_df11.drop([ 'sub_grade', 'emp_title', 'emp_length', 'initial_list_status', 'title', 'earliest_cr_line'], axis =1)
train_df12
| loan_amnt | int_rate | installment | grade | annual_inc | loan_status | dti | open_acc | pub_rec | revol_bal | revol_util | total_acc | mort_acc | pub_rec_bankruptcies | term_60 | home_ownership_Other | home_ownership_Own | home_ownership_Rent | verification_status_Verified | purpose_Credit_Card | purpose_Debt_Consolidation | purpose_Educational | purpose_Home_Improvement | purpose_House | purpose_Major_Purchase | purpose_Medical | purpose_Moving | purpose_Other | purpose_Renewable_Energy | purpose_Small_Business | purpose_Vacation | purpose_Wedding | zip_code_01 | zip_code_02 | zip_code_03 | zip_code_04 | zip_code_05 | zip_code_06 | zip_code_07 | zip_code_08 | zip_code_09 | zip_code_10 | zip_code_11 | zip_code_12 | zip_code_13 | zip_code_14 | zip_code_15 | zip_code_16 | zip_code_17 | zip_code_18 | zip_code_19 | zip_code_20 | zip_code_21 | zip_code_22 | zip_code_23 | zip_code_24 | zip_code_25 | zip_code_26 | zip_code_27 | zip_code_28 | zip_code_29 | zip_code_30 | zip_code_31 | zip_code_32 | zip_code_33 | zip_code_34 | zip_code_35 | zip_code_36 | zip_code_37 | zip_code_38 | zip_code_39 | zip_code_40 | zip_code_41 | zip_code_42 | zip_code_43 | zip_code_44 | zip_code_45 | zip_code_46 | zip_code_47 | zip_code_48 | zip_code_49 | zip_code_50 | zip_code_51 | zip_code_52 | zip_code_53 | zip_code_54 | zip_code_55 | zip_code_56 | zip_code_57 | zip_code_58 | zip_code_59 | zip_code_60 | zip_code_61 | zip_code_62 | zip_code_63 | zip_code_64 | zip_code_65 | zip_code_66 | zip_code_67 | zip_code_68 | zip_code_69 | zip_code_70 | zip_code_71 | zip_code_72 | zip_code_73 | zip_code_74 | zip_code_75 | zip_code_76 | zip_code_77 | zip_code_78 | zip_code_79 | zip_code_80 | zip_code_81 | zip_code_82 | zip_code_83 | zip_code_84 | zip_code_85 | zip_code_86 | zip_code_87 | zip_code_88 | zip_code_89 | zip_code_90 | zip_code_91 | zip_code_92 | zip_code_93 | zip_code_94 | zip_code_95 | zip_code_96 | zip_code_97 | zip_code_98 | zip_code_99 | application_type_Joint App | Year_cr_line_1944 | Year_cr_line_1946 | Year_cr_line_1948 | Year_cr_line_1949 | Year_cr_line_1950 | Year_cr_line_1951 | Year_cr_line_1952 | Year_cr_line_1953 | Year_cr_line_1954 | Year_cr_line_1955 | Year_cr_line_1956 | Year_cr_line_1957 | Year_cr_line_1958 | Year_cr_line_1959 | Year_cr_line_1960 | Year_cr_line_1961 | Year_cr_line_1962 | Year_cr_line_1963 | Year_cr_line_1964 | Year_cr_line_1965 | Year_cr_line_1966 | Year_cr_line_1967 | Year_cr_line_1968 | Year_cr_line_1969 | Year_cr_line_1970 | Year_cr_line_1971 | Year_cr_line_1972 | Year_cr_line_1973 | Year_cr_line_1974 | Year_cr_line_1975 | Year_cr_line_1976 | Year_cr_line_1977 | Year_cr_line_1978 | Year_cr_line_1979 | Year_cr_line_1980 | Year_cr_line_1981 | Year_cr_line_1982 | Year_cr_line_1983 | Year_cr_line_1984 | Year_cr_line_1985 | Year_cr_line_1986 | Year_cr_line_1987 | Year_cr_line_1988 | Year_cr_line_1989 | Year_cr_line_1990 | Year_cr_line_1991 | Year_cr_line_1992 | Year_cr_line_1993 | Year_cr_line_1994 | Year_cr_line_1995 | Year_cr_line_1996 | Year_cr_line_1997 | Year_cr_line_1998 | Year_cr_line_1999 | Year_cr_line_2000 | Year_cr_line_2001 | Year_cr_line_2002 | Year_cr_line_2003 | Year_cr_line_2004 | Year_cr_line_2005 | Year_cr_line_2006 | Year_cr_line_2007 | Year_cr_line_2008 | Year_cr_line_2009 | Year_cr_line_2010 | Year_cr_line_2011 | Year_cr_line_2012 | Year_cr_line_2013 | Year_cr_line_2014 | Year_cr_line_2015 | Year_cr_line_2016 | Year_cr_line_2017 | annual_inc_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 481659 | 27650.0 | 8.19 | 868.88 | 0 | 90000.0 | 0 | 29.08 | 12.0 | 0.0 | 14096.0 | 31.6 | 24.0 | 1.0 | 0.0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.407565 |
| 369318 | 9000.0 | 10.81 | 293.84 | 0 | 53000.0 | 0 | 26.86 | 29.0 | 0.0 | 18990.0 | 34.7 | 38.0 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 10.878047 |
| 173152 | 24000.0 | 8.19 | 488.82 | 0 | 94000.0 | 0 | 23.38 | 11.0 | 0.0 | 21936.0 | 28.5 | 24.0 | 3.0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.451050 |
| 173153 | 12500.0 | 6.49 | 383.06 | 0 | 69500.0 | 0 | 14.50 | 14.0 | 0.0 | 14163.0 | 32.5 | 28.0 | 1.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.149082 |
| 902087 | 5600.0 | 7.89 | 175.20 | 0 | 55000.0 | 1 | 13.11 | 5.0 | 0.0 | 4771.0 | 71.2 | 21.0 | 2.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.915088 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 418365 | 11525.0 | 28.99 | 365.76 | 6 | 28800.0 | 0 | 40.08 | 14.0 | 0.0 | 14701.0 | 53.7 | 17.0 | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.268131 |
| 802582 | 30000.0 | 25.80 | 894.67 | 6 | 120000.0 | 1 | 16.12 | 14.0 | 0.0 | 18020.0 | 49.6 | 46.0 | 3.0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.695247 |
| 130405 | 35000.0 | 30.79 | 1149.42 | 6 | 80000.0 | 0 | 17.48 | 9.0 | 0.0 | 950.0 | 25.0 | 22.0 | 5.0 | 0.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.289782 |
| 861165 | 21000.0 | 26.77 | 638.36 | 6 | 105000.0 | 1 | 5.03 | 13.0 | 0.0 | 3053.0 | 38.6 | 16.0 | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.561716 |
| 409828 | 30000.0 | 25.80 | 894.67 | 6 | 92851.2 | 0 | 12.73 | 6.0 | 0.0 | 21961.0 | 88.2 | 13.0 | 4.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.438753 |
967893 rows × 205 columns
OVERALL GOAL:
- Building a machine learning model that is capable of generating predictions
Treat Imbalanced Data
del [train_df, train_df2, train_df3, train_df4, train_df5, train_df6, train_df7, train_df8]
feat_selection = train_df12.columns.tolist()
feat_selection.remove('loan_status')
# from imblearn.over_sampling import SMOTE
# from sklearn.utils import shuffle
# train_df9 = shuffle(train_df9)
# sm = SMOTE(sampling_strategy='minority', random_state= 1)
# X_sm, y_sm = sm.fit_resample(train_df9[feat_selection], train_df9['loan_status'])
# train_df10 = pd.concat([pd.DataFrame(y_sm), pd.DataFrame(X_sm)], axis = 1)
# from imblearn.under_sampling import NearMiss
# undersample = NearMiss(version = 1, n_neighbors = 3)
# X_us, y_us = undersample.fit_resample(train_df9[feat_selection], train_df9['loan_status'])
# train_df10 = pd.concat([pd.DataFrame(y_us), pd.DataFrame(X_us)], axis = 1)
from sklearn.utils import shuffle
train_df13 = shuffle(train_df12, random_state = 1).reset_index()
train_df13
| index | loan_amnt | int_rate | installment | grade | annual_inc | loan_status | dti | open_acc | pub_rec | revol_bal | revol_util | total_acc | mort_acc | pub_rec_bankruptcies | term_60 | home_ownership_Other | home_ownership_Own | home_ownership_Rent | verification_status_Verified | purpose_Credit_Card | purpose_Debt_Consolidation | purpose_Educational | purpose_Home_Improvement | purpose_House | purpose_Major_Purchase | purpose_Medical | purpose_Moving | purpose_Other | purpose_Renewable_Energy | purpose_Small_Business | purpose_Vacation | purpose_Wedding | zip_code_01 | zip_code_02 | zip_code_03 | zip_code_04 | zip_code_05 | zip_code_06 | zip_code_07 | zip_code_08 | zip_code_09 | zip_code_10 | zip_code_11 | zip_code_12 | zip_code_13 | zip_code_14 | zip_code_15 | zip_code_16 | zip_code_17 | zip_code_18 | zip_code_19 | zip_code_20 | zip_code_21 | zip_code_22 | zip_code_23 | zip_code_24 | zip_code_25 | zip_code_26 | zip_code_27 | zip_code_28 | zip_code_29 | zip_code_30 | zip_code_31 | zip_code_32 | zip_code_33 | zip_code_34 | zip_code_35 | zip_code_36 | zip_code_37 | zip_code_38 | zip_code_39 | zip_code_40 | zip_code_41 | zip_code_42 | zip_code_43 | zip_code_44 | zip_code_45 | zip_code_46 | zip_code_47 | zip_code_48 | zip_code_49 | zip_code_50 | zip_code_51 | zip_code_52 | zip_code_53 | zip_code_54 | zip_code_55 | zip_code_56 | zip_code_57 | zip_code_58 | zip_code_59 | zip_code_60 | zip_code_61 | zip_code_62 | zip_code_63 | zip_code_64 | zip_code_65 | zip_code_66 | zip_code_67 | zip_code_68 | zip_code_69 | zip_code_70 | zip_code_71 | zip_code_72 | zip_code_73 | zip_code_74 | zip_code_75 | zip_code_76 | zip_code_77 | zip_code_78 | zip_code_79 | zip_code_80 | zip_code_81 | zip_code_82 | zip_code_83 | zip_code_84 | zip_code_85 | zip_code_86 | zip_code_87 | zip_code_88 | zip_code_89 | zip_code_90 | zip_code_91 | zip_code_92 | zip_code_93 | zip_code_94 | zip_code_95 | zip_code_96 | zip_code_97 | zip_code_98 | zip_code_99 | application_type_Joint App | Year_cr_line_1944 | Year_cr_line_1946 | Year_cr_line_1948 | Year_cr_line_1949 | Year_cr_line_1950 | Year_cr_line_1951 | Year_cr_line_1952 | Year_cr_line_1953 | Year_cr_line_1954 | Year_cr_line_1955 | Year_cr_line_1956 | Year_cr_line_1957 | Year_cr_line_1958 | Year_cr_line_1959 | Year_cr_line_1960 | Year_cr_line_1961 | Year_cr_line_1962 | Year_cr_line_1963 | Year_cr_line_1964 | Year_cr_line_1965 | Year_cr_line_1966 | Year_cr_line_1967 | Year_cr_line_1968 | Year_cr_line_1969 | Year_cr_line_1970 | Year_cr_line_1971 | Year_cr_line_1972 | Year_cr_line_1973 | Year_cr_line_1974 | Year_cr_line_1975 | Year_cr_line_1976 | Year_cr_line_1977 | Year_cr_line_1978 | Year_cr_line_1979 | Year_cr_line_1980 | Year_cr_line_1981 | Year_cr_line_1982 | Year_cr_line_1983 | Year_cr_line_1984 | Year_cr_line_1985 | Year_cr_line_1986 | Year_cr_line_1987 | Year_cr_line_1988 | Year_cr_line_1989 | Year_cr_line_1990 | Year_cr_line_1991 | Year_cr_line_1992 | Year_cr_line_1993 | Year_cr_line_1994 | Year_cr_line_1995 | Year_cr_line_1996 | Year_cr_line_1997 | Year_cr_line_1998 | Year_cr_line_1999 | Year_cr_line_2000 | Year_cr_line_2001 | Year_cr_line_2002 | Year_cr_line_2003 | Year_cr_line_2004 | Year_cr_line_2005 | Year_cr_line_2006 | Year_cr_line_2007 | Year_cr_line_2008 | Year_cr_line_2009 | Year_cr_line_2010 | Year_cr_line_2011 | Year_cr_line_2012 | Year_cr_line_2013 | Year_cr_line_2014 | Year_cr_line_2015 | Year_cr_line_2016 | Year_cr_line_2017 | annual_inc_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 708815 | 2600.0 | 10.41 | 84.40 | 1 | 40000.00 | 1 | 9.39 | 6.0 | 4.0 | 2146.0 | 52.3 | 32.0 | 1.0 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.596635 |
| 1 | 236060 | 16000.0 | 14.09 | 547.55 | 1 | 62000.00 | 0 | 32.18 | 5.0 | 0.0 | 11896.0 | 67.2 | 18.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.034890 |
| 2 | 292369 | 15000.0 | 14.74 | 354.81 | 2 | 38000.00 | 0 | 16.84 | 7.0 | 0.0 | 15927.0 | 58.1 | 10.0 | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.545341 |
| 3 | 160815 | 3500.0 | 16.95 | 124.70 | 2 | 61000.00 | 0 | 25.65 | 10.0 | 0.0 | 14464.0 | 60.0 | 22.0 | 3.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.018629 |
| 4 | 691072 | 16000.0 | 13.59 | 543.67 | 2 | 90000.00 | 1 | 16.99 | 18.0 | 0.0 | 15440.0 | 54.6 | 27.0 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.407565 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 967888 | 1012275 | 12000.0 | 14.08 | 410.60 | 2 | 48100.00 | 1 | 18.34 | 9.0 | 0.0 | 11940.0 | 56.3 | 24.0 | 1.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.781037 |
| 967889 | 489905 | 35000.0 | 16.55 | 1240.03 | 3 | 75000.00 | 0 | 20.73 | 8.0 | 0.0 | 5446.0 | 93.9 | 20.0 | 5.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.225243 |
| 967890 | 867007 | 14000.0 | 13.33 | 473.95 | 2 | 102000.00 | 1 | 22.89 | 15.0 | 0.0 | 24866.0 | 67.6 | 40.0 | 3.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.532728 |
| 967891 | 1019304 | 8950.0 | 14.08 | 306.24 | 2 | 30000.00 | 1 | 14.92 | 7.0 | 0.0 | 12537.0 | 80.0 | 8.0 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.308953 |
| 967892 | 457634 | 5000.0 | 7.21 | 154.87 | 0 | 49358.39 | 0 | 23.68 | 11.0 | 0.0 | 2793.0 | 6.8 | 15.0 | 2.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.806863 |
967893 rows × 206 columns
Import the models of your choice
We select the features we want to use in predicting our outcome
reduce_memory_usage(train_df13, verbose=True)
Mem. usage decreased to 215.07 Mb (25.8% reduction)
| index | loan_amnt | int_rate | installment | grade | annual_inc | loan_status | dti | open_acc | pub_rec | revol_bal | revol_util | total_acc | mort_acc | pub_rec_bankruptcies | term_60 | home_ownership_Other | home_ownership_Own | home_ownership_Rent | verification_status_Verified | purpose_Credit_Card | purpose_Debt_Consolidation | purpose_Educational | purpose_Home_Improvement | purpose_House | purpose_Major_Purchase | purpose_Medical | purpose_Moving | purpose_Other | purpose_Renewable_Energy | purpose_Small_Business | purpose_Vacation | purpose_Wedding | zip_code_01 | zip_code_02 | zip_code_03 | zip_code_04 | zip_code_05 | zip_code_06 | zip_code_07 | zip_code_08 | zip_code_09 | zip_code_10 | zip_code_11 | zip_code_12 | zip_code_13 | zip_code_14 | zip_code_15 | zip_code_16 | zip_code_17 | zip_code_18 | zip_code_19 | zip_code_20 | zip_code_21 | zip_code_22 | zip_code_23 | zip_code_24 | zip_code_25 | zip_code_26 | zip_code_27 | zip_code_28 | zip_code_29 | zip_code_30 | zip_code_31 | zip_code_32 | zip_code_33 | zip_code_34 | zip_code_35 | zip_code_36 | zip_code_37 | zip_code_38 | zip_code_39 | zip_code_40 | zip_code_41 | zip_code_42 | zip_code_43 | zip_code_44 | zip_code_45 | zip_code_46 | zip_code_47 | zip_code_48 | zip_code_49 | zip_code_50 | zip_code_51 | zip_code_52 | zip_code_53 | zip_code_54 | zip_code_55 | zip_code_56 | zip_code_57 | zip_code_58 | zip_code_59 | zip_code_60 | zip_code_61 | zip_code_62 | zip_code_63 | zip_code_64 | zip_code_65 | zip_code_66 | zip_code_67 | zip_code_68 | zip_code_69 | zip_code_70 | zip_code_71 | zip_code_72 | zip_code_73 | zip_code_74 | zip_code_75 | zip_code_76 | zip_code_77 | zip_code_78 | zip_code_79 | zip_code_80 | zip_code_81 | zip_code_82 | zip_code_83 | zip_code_84 | zip_code_85 | zip_code_86 | zip_code_87 | zip_code_88 | zip_code_89 | zip_code_90 | zip_code_91 | zip_code_92 | zip_code_93 | zip_code_94 | zip_code_95 | zip_code_96 | zip_code_97 | zip_code_98 | zip_code_99 | application_type_Joint App | Year_cr_line_1944 | Year_cr_line_1946 | Year_cr_line_1948 | Year_cr_line_1949 | Year_cr_line_1950 | Year_cr_line_1951 | Year_cr_line_1952 | Year_cr_line_1953 | Year_cr_line_1954 | Year_cr_line_1955 | Year_cr_line_1956 | Year_cr_line_1957 | Year_cr_line_1958 | Year_cr_line_1959 | Year_cr_line_1960 | Year_cr_line_1961 | Year_cr_line_1962 | Year_cr_line_1963 | Year_cr_line_1964 | Year_cr_line_1965 | Year_cr_line_1966 | Year_cr_line_1967 | Year_cr_line_1968 | Year_cr_line_1969 | Year_cr_line_1970 | Year_cr_line_1971 | Year_cr_line_1972 | Year_cr_line_1973 | Year_cr_line_1974 | Year_cr_line_1975 | Year_cr_line_1976 | Year_cr_line_1977 | Year_cr_line_1978 | Year_cr_line_1979 | Year_cr_line_1980 | Year_cr_line_1981 | Year_cr_line_1982 | Year_cr_line_1983 | Year_cr_line_1984 | Year_cr_line_1985 | Year_cr_line_1986 | Year_cr_line_1987 | Year_cr_line_1988 | Year_cr_line_1989 | Year_cr_line_1990 | Year_cr_line_1991 | Year_cr_line_1992 | Year_cr_line_1993 | Year_cr_line_1994 | Year_cr_line_1995 | Year_cr_line_1996 | Year_cr_line_1997 | Year_cr_line_1998 | Year_cr_line_1999 | Year_cr_line_2000 | Year_cr_line_2001 | Year_cr_line_2002 | Year_cr_line_2003 | Year_cr_line_2004 | Year_cr_line_2005 | Year_cr_line_2006 | Year_cr_line_2007 | Year_cr_line_2008 | Year_cr_line_2009 | Year_cr_line_2010 | Year_cr_line_2011 | Year_cr_line_2012 | Year_cr_line_2013 | Year_cr_line_2014 | Year_cr_line_2015 | Year_cr_line_2016 | Year_cr_line_2017 | annual_inc_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 708815 | 2600.0 | 10.406250 | 84.3750 | 1 | 40000.000000 | 1 | 9.390625 | 6.0 | 4.0 | 2146.0 | 52.312500 | 32.0 | 1.0 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.593750 |
| 1 | 236060 | 16000.0 | 14.093750 | 547.5000 | 1 | 62000.000000 | 0 | 32.187500 | 5.0 | 0.0 | 11896.0 | 67.187500 | 18.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.031250 |
| 2 | 292369 | 15000.0 | 14.742188 | 354.7500 | 2 | 38000.000000 | 0 | 16.843750 | 7.0 | 0.0 | 15927.0 | 58.093750 | 10.0 | 0.0 | 0.0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.546875 |
| 3 | 160815 | 3500.0 | 16.953125 | 124.6875 | 2 | 61000.000000 | 0 | 25.656250 | 10.0 | 0.0 | 14464.0 | 60.000000 | 22.0 | 3.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.015625 |
| 4 | 691072 | 16000.0 | 13.593750 | 543.5000 | 2 | 90000.000000 | 1 | 16.984375 | 18.0 | 0.0 | 15440.0 | 54.593750 | 27.0 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.406250 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 967888 | 1012275 | 12000.0 | 14.078125 | 410.5000 | 2 | 48100.000000 | 1 | 18.343750 | 9.0 | 0.0 | 11940.0 | 56.312500 | 24.0 | 1.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.781250 |
| 967889 | 489905 | 35008.0 | 16.546875 | 1240.0000 | 3 | 75000.000000 | 0 | 20.734375 | 8.0 | 0.0 | 5446.0 | 93.875000 | 20.0 | 5.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.226562 |
| 967890 | 867007 | 14000.0 | 13.328125 | 474.0000 | 2 | 102000.000000 | 1 | 22.890625 | 15.0 | 0.0 | 24866.0 | 67.625000 | 40.0 | 3.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.531250 |
| 967891 | 1019304 | 8952.0 | 14.078125 | 306.2500 | 2 | 30000.000000 | 1 | 14.921875 | 7.0 | 0.0 | 12537.0 | 80.000000 | 8.0 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.312500 |
| 967892 | 457634 | 5000.0 | 7.210938 | 154.8750 | 0 | 49358.390625 | 0 | 23.687500 | 11.0 | 0.0 | 2793.0 | 6.800781 | 15.0 | 2.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.804688 |
967893 rows × 206 columns
Train-Test Split</br> We split the data to facilitate the evaluation of the model
X = train_df13[feat_selection] # Select the features you want to use to predict the loan_status
y = train_df13['loan_status'].astype(int)
logReg = LogisticRegression(random_state=0, class_weight='balanced')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
logReg = logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)
y_pred_proba = logReg.predict_proba(X_test)[:,1]
Confusion Matrix
metrics.confusion_matrix(y_test, y_pred)
array([[117425, 60595],
[ 39985, 72363]], dtype=int64)
print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.6440969131626731 Precision 0.5442545766332225 F1 Score 0.5899814925032408
We evaluate the model's AUC using metrics.roc_auc_score()
print('AUC:', metrics.roc_auc_score(y_test, y_pred_proba))
AUC: 0.7100509095839154
RFC on Imablanced Data: https://medium.com/sfu-cspmp/surviving-in-a-random-forest-with-imbalanced-datasets-b98b963d52eb
rfc = BalancedRandomForestClassifier(random_state = 5)
param = {
'n_estimators' : [100, 200],
'max_depth' : [2,4,8, None],
'max_features' : ['sqrt', 'log2', None]
}
rfc_grid = RandomizedSearchCV(rfc, param_distributions = param, n_jobs = -1, scoring = 'recall')
rfc_grid.fit(X_train, y_train)
y_pred_proba= rfc_grid.predict_proba(X_test)[:,1]
y_pred = rfc_grid.predict(X_test)
print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.7177697867340762 Precision 0.5085739872982638 F1 Score 0.5953290588352546
XGB on Imbalanced Data: https://medium.com/@rithpansanga/xgboost-and-imbalanced-datasets-strategies-for-handling-class-imbalance-cdd810b3905c
pos_weight = y_train.value_counts()[0]/y_train.value_counts()[1]
pos_weight
1.579937017588619
xgboost = xgb.XGBClassifier(booster = 'gbtree', random_state = 5, scale_pos_weight= pos_weight)
xgboost.fit(X_train, y_train)
y_pred_proba= xgboost.predict_proba(X_test)[:,1]
y_pred = xgboost.predict(X_test)
print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.7058247587852031 Precision 0.5524414627179691 F1 Score 0.619784359624681
xgboost = xgb.XGBClassifier(booster = 'gbtree', random_state = 5, scale_pos_weight= pos_weight)
param = {'learning_rate': [0.1, 0.3, 0.5], 'max_depth' : [2, 4, 8]}
xgboost_grid = RandomizedSearchCV(xgboost, param_distributions = param, n_iter = 3, n_jobs = -1, scoring = 'recall')
xgboost_grid.fit(X_train, y_train)
y_pred_proba= xgboost_grid.predict_proba(X_test)[:,1]
y_pred = xgboost_grid.predict(X_test)
print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
Recall 0.7068750667568626 Precision 0.5530939861406136 F1 Score 0.6205999226360024
feat_importances = pd.Series(xgboost.feature_importances_, index=X.columns).sort_values(ascending = False)
fig = px.bar(feat_importances)
fig.show()
perceptron = MLPClassifier()
param = {
'alpha': [0.0001, 0.001],
'learning_rate_init' : [0.1, 0.2, 0.3, 0.4, 0.5],
'max_iter': [500, 1000],
'tol' : [0.001, 0.005]
}
perceptron_grid = GridSearchCV(perceptron, param_grid = param, scoring = 'recall')
perceptron_grid.fit(X_train, y_train.ravel())
y_pred_proba= perceptron.predict_proba(X_test)[:,1]
y_predict = perceptron.predict(X_test)
print("Recall", metrics.recall_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("F1 Score", metrics.f1_score(y_test, y_pred))
# from sklearn.neural_network import MLPClassifier
# alpha = 0
# learing_rate = 0
# max_iteration = 0
# tol = 0
# n_iter = 0
# bestAUC = 0
# acc = 0
# print('Alpha | Learning Rate | Max Iteration | Tolerance | AUC | Accuracy')
# for a in [0.0001, 0.001]:
# for learning_rate in range(1,5):
# for max_iteration in [500, 1000]:
# for tol in [0.001, 0.005]:
# perceptron = MLPClassifier(solver = 'adam', alpha = a,
# learning_rate_init = (learning_rate/10), max_iter = max_iteration,
# tol = tol, random_state = 5)
# perceptron.fit(X_train, y_train.ravel())
# y_pred_proba= perceptron.predict_proba(X_test)[:,1]
# y_predict = perceptron.predict(X_test)
# auc_score = metrics.roc_auc_score(y_test, y_pred_proba)
# accuracy = metrics.accuracy_score(y_test, y_predict)
# print(a, learning_rate/10, max_iteration, tol, round(auc_score,4), round(accuracy, 4))
# if auc_score > bestAUC:
# bestAUC = round(auc_score,4)
# alpha = a
# learing_rate = learning_rate/10
# max_iteration = max_iteration
# tol = tol
# acc = round(accuracy,4)
# else:
# pass
# print(alpha, "|", learning_rate, "|", max_iteration, "|", tol, "|", bestAUC, "|", acc)